pax_global_header00006660000000000000000000000064151567510750014526gustar00rootroot0000000000000052 comment=36ad04841c209cb8b3577ec2723d431b6bf7efa0 libde265-1.0.18/000077500000000000000000000000001515675107500131315ustar00rootroot00000000000000libde265-1.0.18/AUTHORS000066400000000000000000000003251515675107500142010ustar00rootroot00000000000000Authors of libde265 See also the files THANKS and ChangeLog Dirk Farin - designed and implemented libde265 Joachim Bauch - bugfixes, optimizations and support for Windows libde265-1.0.18/CMakeLists.txt000066400000000000000000000133421515675107500156740ustar00rootroot00000000000000cmake_minimum_required (VERSION 3.16.3) project (libde265 LANGUAGES C CXX VERSION 1.0.18 ) # Auto-compute BCD-encoded numeric version from project version. # Each component is BCD-encoded: decimal 16 → 0x16. # Result: 1.0.16 → 0x01001600 math(EXPR _maj_bcd "(${PROJECT_VERSION_MAJOR}/10)*16 + (${PROJECT_VERSION_MAJOR}%10)") math(EXPR _min_bcd "(${PROJECT_VERSION_MINOR}/10)*16 + (${PROJECT_VERSION_MINOR}%10)") math(EXPR _pat_bcd "(${PROJECT_VERSION_PATCH}/10)*16 + (${PROJECT_VERSION_PATCH}%10)") math(EXPR NUMERIC_VERSION "${_maj_bcd} * 16777216 + ${_min_bcd} * 65536 + ${_pat_bcd} * 256" OUTPUT_FORMAT HEXADECIMAL) set (PACKAGE_VERSION ${PROJECT_VERSION}) # --- Shared library version (ABI versioning, independent of project version) --- # # This controls the shared library filename: # libde265.so -> libde265.so.0 -> libde265.so.0.1.9 # ^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^ # SOVERSION VERSION # # VERSION is a three-part number: MAJOR.MINOR.PATCH # - MAJOR = the ABI major version (same as SOVERSION). # Bump when the ABI breaks (exported functions removed or signatures changed). # Reset MINOR and PATCH to 0. # - MINOR = backward-compatible ABI additions. # Bump when new exported functions are added but old ones still work. # Reset PATCH to 0. # - PATCH = implementation-only changes (bug fixes, performance improvements). # Bump when the ABI is unchanged. 
# # SOVERSION must always equal the MAJOR part of VERSION. # Programs linked against libde265.so.0 will work with any libde265.so.0.x.y. # set(DE265_SOVERSION 0) set(DE265_LIBRARY_VERSION "0.1.11") set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_POSITION_INDEPENDENT_CODE ON) add_compile_options( "$<$,$>>:-Wall;-Wextra;-Wpedantic;-Wno-unused-parameter>" "$<$,$>:/W4>" ) option(USE_IWYU "Run include-what-you-use analysis during build" OFF) if (USE_IWYU) find_program(IWYU_PATH include-what-you-use) if (IWYU_PATH) set(CMAKE_CXX_INCLUDE_WHAT_YOU_USE ${IWYU_PATH}) set(CMAKE_C_INCLUDE_WHAT_YOU_USE ${IWYU_PATH}) else() message(WARNING "include-what-you-use not found, USE_IWYU ignored") endif() endif() include (${CMAKE_ROOT}/Modules/CheckCCompilerFlag.cmake) include (${CMAKE_ROOT}/Modules/CheckIncludeFile.cmake) include(GNUInstallDirs) include(CheckFunctionExists) option(ENABLE_SDL "Enable SDL" ON) if (ENABLE_SDL) find_package(SDL2) endif() CHECK_INCLUDE_FILE(malloc.h HAVE_MALLOC_H) CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN) include(CheckCSourceCompiles) check_c_source_compiles( "#if !defined(__x86_64) && !defined(__i386__) \ && !defined(_M_IX86) && !defined(_M_AMD64) \ || defined(_M_ARM64EC) || defined(_ARM64EC_) #error not x86 #endif int main(){return 0;}" HAVE_X86) if(HAVE_X86) if (MSVC) set(SUPPORTS_SSE2 1) set(SUPPORTS_SSSE3 1) set(SUPPORTS_SSE4_1 1) else() check_c_compiler_flag(-msse2 SUPPORTS_SSE2) check_c_compiler_flag(-mssse3 SUPPORTS_SSSE3) check_c_compiler_flag(-msse4.1 SUPPORTS_SSE4_1) endif() if(SUPPORTS_SSE4_1) set(HAVE_SSE4_1 TRUE) endif() endif() check_c_source_compiles( "#if !defined(__arm__) && !defined(__aarch64__) \ && !defined(_M_ARM) && !defined(_M_ARM64) #error not ARM #endif int main(){return 0;}" HAVE_ARM) if(HAVE_ARM) enable_language(ASM) check_c_compiler_flag(-mfpu=neon HAVE_NEON) endif() configure_file (libde265/de265-version.h.in libde265/de265-version.h) configure_file 
(cmake/config.h.in config.h) add_definitions(-DHAVE_CONFIG_H) if(CMAKE_COMPILER_IS_GNUCXX OR ${CMAKE_CXX_COMPILER_ID} MATCHES Clang) add_definitions(-Wall -Werror=return-type -Werror=unused-result -Werror=reorder) endif() include(CheckCXXSymbolExists) check_cxx_symbol_exists(_LIBCPP_VERSION cstdlib HAVE_LIBCPP) if(HAVE_LIBCPP) set(LIBS_PRIVATE "-lc++") else() set(LIBS_PRIVATE "-lstdc++") endif() option(BUILD_SHARED_LIBS "Build shared library" ON) if(NOT BUILD_SHARED_LIBS) add_definitions(-DLIBDE265_STATIC_BUILD) endif() if(APPLE) option(BUILD_FRAMEWORK "Build as Apple Frameworks" OFF) endif() include_directories ("${PROJECT_BINARY_DIR}") include_directories ("${PROJECT_SOURCE_DIR}") include_directories ("${PROJECT_SOURCE_DIR}/libde265") if(MSVC) include_directories ("${PROJECT_SOURCE_DIR}/extra") endif() find_package(Threads) option(ENABLE_DECODER "Enable Decoder" ON) option(ENABLE_ENCODER "Enable Encoder" OFF) option(ENABLE_SHERLOCK265 "Build sherlock265 visual inspection tool" OFF) option(ENABLE_INTERNAL_DEVELOPMENT_TOOLS "Build internal development tools (not for end users)" OFF) option(WITH_FUZZERS "Build the fuzzers" OFF) set(FUZZING_SANITIZER_OPTIONS "-fsanitize=address,shift,integer" "-fno-sanitize=unsigned-shift-base" "-fno-sanitize-recover=shift,integer" CACHE STRING "Sanitizer flags for fuzzing builds") if (WITH_FUZZERS) add_compile_options("-fsanitize=fuzzer-no-link" ${FUZZING_SANITIZER_OPTIONS}) add_link_options(${FUZZING_SANITIZER_OPTIONS}) endif() add_subdirectory (libde265) if (ENABLE_DECODER) add_subdirectory (dec265) endif() if (ENABLE_ENCODER) add_subdirectory (enc265) endif() if (ENABLE_SHERLOCK265) add_subdirectory (sherlock265) endif() if (ENABLE_INTERNAL_DEVELOPMENT_TOOLS) add_subdirectory (dev-tools) endif() if (WITH_FUZZERS) add_subdirectory (fuzzing) endif() add_custom_target(dist COMMAND git archive --format=tar.gz --worktree-attributes --prefix=${PROJECT_NAME}-${PROJECT_VERSION}/ -o 
${CMAKE_BINARY_DIR}/${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz HEAD WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} COMMENT "Creating source tarball ${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz" ) libde265-1.0.18/CMakePresets.json000066400000000000000000000020031515675107500163450ustar00rootroot00000000000000{ "version": 6, "configurePresets": [ { "name": "release", "displayName": "Release", "cacheVariables": { "CMAKE_BUILD_TYPE": "Release", "ENABLE_ENCODER": "OFF" } }, { "name": "fuzzing", "displayName": "Fuzzing (libFuzzer)", "binaryDir": "${sourceDir}/build-fuzzing", "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug", "CMAKE_C_COMPILER": "clang", "CMAKE_CXX_COMPILER": "clang++", "BUILD_SHARED_LIBS": "OFF", "WITH_FUZZERS": "ON", "ENABLE_ENCODER": "OFF", "ENABLE_SDL": "OFF" } }, { "name": "afl", "displayName": "Fuzzing (AFL++)", "binaryDir": "${sourceDir}/build-afl", "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug", "CMAKE_C_COMPILER": "afl-cc", "CMAKE_CXX_COMPILER": "afl-c++", "BUILD_SHARED_LIBS": "OFF", "ENABLE_DECODER": "ON", "ENABLE_ENCODER": "OFF", "ENABLE_SDL": "OFF" } } ] } libde265-1.0.18/COPYING000066400000000000000000001264631515675107500142000ustar00rootroot00000000000000* The library `libde265` is distributed under the terms of the GNU Lesser General Public License. * The sample applications are distributed under the terms of the MIT license. License texts below and in the `COPYING` files of the corresponding subfolders. ---------------------------------------------------------------------- GNU LESSER GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. This version of the GNU Lesser General Public License incorporates the terms and conditions of version 3 of the GNU General Public License, supplemented by the additional permissions listed below. 0. Additional Definitions. 
As used herein, "this License" refers to version 3 of the GNU Lesser General Public License, and the "GNU GPL" refers to version 3 of the GNU General Public License. "The Library" refers to a covered work governed by this License, other than an Application or a Combined Work as defined below. An "Application" is any work that makes use of an interface provided by the Library, but which is not otherwise based on the Library. Defining a subclass of a class defined by the Library is deemed a mode of using an interface provided by the Library. A "Combined Work" is a work produced by combining or linking an Application with the Library. The particular version of the Library with which the Combined Work was made is also called the "Linked Version". The "Minimal Corresponding Source" for a Combined Work means the Corresponding Source for the Combined Work, excluding any source code for portions of the Combined Work that, considered in isolation, are based on the Application, and not on the Linked Version. The "Corresponding Application Code" for a Combined Work means the object code and/or source code for the Application, including any data and utility programs needed for reproducing the Combined Work from the Application, but excluding the System Libraries of the Combined Work. 1. Exception to Section 3 of the GNU GPL. You may convey a covered work under sections 3 and 4 of this License without being bound by section 3 of the GNU GPL. 2. Conveying Modified Versions. 
If you modify a copy of the Library, and, in your modifications, a facility refers to a function or data to be supplied by an Application that uses the facility (other than as an argument passed when the facility is invoked), then you may convey a copy of the modified version: a) under this License, provided that you make a good faith effort to ensure that, in the event an Application does not supply the function or data, the facility still operates, and performs whatever part of its purpose remains meaningful, or b) under the GNU GPL, with none of the additional permissions of this License applicable to that copy. 3. Object Code Incorporating Material from Library Header Files. The object code form of an Application may incorporate material from a header file that is part of the Library. You may convey such object code under terms of your choice, provided that, if the incorporated material is not limited to numerical parameters, data structure layouts and accessors, or small macros, inline functions and templates (ten or fewer lines in length), you do both of the following: a) Give prominent notice with each copy of the object code that the Library is used in it and that the Library and its use are covered by this License. b) Accompany the object code with a copy of the GNU GPL and this license document. 4. Combined Works. You may convey a Combined Work under terms of your choice that, taken together, effectively do not restrict modification of the portions of the Library contained in the Combined Work and reverse engineering for debugging such modifications, if you also do each of the following: a) Give prominent notice with each copy of the Combined Work that the Library is used in it and that the Library and its use are covered by this License. b) Accompany the Combined Work with a copy of the GNU GPL and this license document. 
c) For a Combined Work that displays copyright notices during execution, include the copyright notice for the Library among these notices, as well as a reference directing the user to the copies of the GNU GPL and this license document. d) Do one of the following: 0) Convey the Minimal Corresponding Source under the terms of this License, and the Corresponding Application Code in a form suitable for, and under terms that permit, the user to recombine or relink the Application with a modified version of the Linked Version to produce a modified Combined Work, in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source. 1) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (a) uses at run time a copy of the Library already present on the user's computer system, and (b) will operate properly with a modified version of the Library that is interface-compatible with the Linked Version. e) Provide Installation Information, but only if you would otherwise be required to provide such information under section 6 of the GNU GPL, and only to the extent that such information is necessary to install and execute a modified version of the Combined Work produced by recombining or relinking the Application with a modified version of the Linked Version. (If you use option 4d0, the Installation Information must accompany the Minimal Corresponding Source and Corresponding Application Code. If you use option 4d1, you must provide the Installation Information in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source.) 5. Combined Libraries. 
You may place library facilities that are a work based on the Library side by side in a single library together with other library facilities that are not Applications and are not covered by this License, and convey such a combined library under terms of your choice, if you do both of the following: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities, conveyed under the terms of this License. b) Give prominent notice with the combined library that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 6. Revised Versions of the GNU Lesser General Public License. The Free Software Foundation may publish revised and/or new versions of the GNU Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library as you received it specifies that a certain numbered version of the GNU Lesser General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that published version or of any later version published by the Free Software Foundation. If the Library as you received it does not specify a version number of the GNU Lesser General Public License, you may choose any version of the GNU Lesser General Public License ever published by the Free Software Foundation. If the Library as you received it specifies that a proxy can decide whether future versions of the GNU Lesser General Public License shall apply, that proxy's public statement of acceptance of any version is permanent authorization for you to choose that version for the Library. 
---------------------------------------------------------------------- GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. 
Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. 
Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. 
A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. 
All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. 
When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. 
This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: Copyright (C) This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. 
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . ---------------------------------------------------------------------- MIT License Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
libde265-1.0.18/ChangeLog000066400000000000000000000001041515675107500146760ustar00rootroot00000000000000See https://github.com/strukturag/libde265 for further information. libde265-1.0.18/NEWS000066400000000000000000000001041515675107500136230ustar00rootroot00000000000000See https://github.com/strukturag/libde265 for further information. libde265-1.0.18/README000066400000000000000000000000471515675107500140120ustar00rootroot00000000000000See README.md for further information. libde265-1.0.18/README.md000066400000000000000000000133451515675107500144160ustar00rootroot00000000000000 libde265 - open h.265 codec implementation ========================================== ![libde265](libde265.png) libde265 is an open source implementation of the h.265 video codec. It is written from scratch and has a plain C API to enable a simple integration into other software. libde265 supports WPP and tile-based multithreading and includes SSE optimizations. The decoder includes all features of the Main profile and correctly decodes almost all conformance streams (see [[wiki page](https://github.com/strukturag/libde265/wiki/Decoder-conformance)]). A list of supported features are available in the [wiki](https://github.com/strukturag/libde265/wiki/Supported-decoding-features). For latest news check our website at http://www.libde265.org The library comes with two example programs: - dec265, a simple player for raw h.265 bitstreams. It serves nicely as an example program how to use libde265. - sherlock265, a Qt-based video player with the additional capability to overlay some graphical representations of the h.265 bitstream (like CU-trees, intra-prediction modes). Example bitstreams can be found, e.g., at this site: ftp://ftp.kw.bbc.co.uk/hevc/hm-10.1-anchors/bitstreams/ra_main/ Approximate performance for WPP, non-tiles streams (measured using the `timehevc` tool from [the GStreamer plugin](https://github.com/strukturag/gstreamer-libde265)). 
The tool plays a Matroska movie to the GStreamer fakesink and measures the average framerate. | Resolution | avg. fps | CPU usage | | ----------------- | -------- | --------- | | [720p][1] | 284 fps | 39 % | | [1080p][2] | 150 fps | 45 % | | [4K][3] | 36 fps | 56 % | Environment: - Intel(R) Core(TM) i7-2700K CPU @ 3.50GHz (4 physical CPU cores) - Ubuntu 12.04, 64bit - GStreamer 0.10.36 [1]: http://trailers.divx.com/hevc/TearsOfSteel_720p_24fps_27qp_831kbps_720p_GPSNR_41.65_HM11_2aud_7subs.mkv [2]: http://trailers.divx.com/hevc/TearsOfSteel_1080p_24fps_27qp_1474kbps_GPSNR_42.29_HM11_2aud_7subs.mkv [3]: http://trailers.divx.com/hevc/TearsOfSteel_4K_24fps_9500kbps_2aud_9subs.mkv Building ======== [![Build Status](https://github.com/strukturag/libde265/workflows/build/badge.svg)](https://github.com/strukturag/libde265/actions) [![Build Status](https://ci.appveyor.com/api/projects/status/github/strukturag/libde265?svg=true)](https://ci.appveyor.com/project/strukturag/libde265) libde265 uses the CMake build system. To compile libde265, run ```` mkdir build cd build cmake .. make ```` libde265 has no dependencies on other libraries, but both optional example programs have dependencies on: - SDL2 (optional for dec265's YUV overlay output), - Qt (required for sherlock265), - libswscale (required for sherlock265 if libvideogfx is not available). - libvideogfx (required for sherlock265 if libswscale is not available, optional for dec265). Libvideogfx can be obtained from http://www.dirk-farin.net/software/libvideogfx/index.html or http://github.com/farindk/libvideogfx Build using cmake ================= cmake scripts to build libde265 and the sample scripts `dec265` and `enc265` are included and can be compiled using these commands: ``` mkdir build cd build cmake .. make ``` You can disable building of the example programs by running `cmake` with
  -DENABLE_DECODER=off     Do not build the dec265 decoder program.
  -DENABLE_SHERLOCK265=off Do not build the sherlock265 visual inspection program.
Additional logging information can be turned on and off using this `cmake` option:
  -DDE265_LOG_LEVEL={error;info;debug;trace}
Building using vcpkg ==================== You can build and install libde265 using the [vcpkg](https://github.com/Microsoft/vcpkg/) dependency manager: ``` git clone https://github.com/Microsoft/vcpkg.git cd vcpkg ./bootstrap-vcpkg.sh ./vcpkg integrate install ./vcpkg install libde265 ``` The libde265 port in vcpkg is kept up to date by Microsoft team members and community contributors. If the version is out of date, please [create an issue or pull request](https://github.com/Microsoft/vcpkg) on the vcpkg repository. Prebuilt binaries ================= Binary packages can be obtained from this [launchpad site](https://launchpad.net/~strukturag/+archive/libde265). Software using libde265 ======================= Libde265 has been integrated into these applications: - libheif [source](https://github.com/strukturag/libheif) - gstreamer plugin, [source](https://github.com/strukturag/gstreamer-libde265), [binary packages](https://launchpad.net/~strukturag/+archive/libde265). - VLC plugin [source](https://github.com/strukturag/vlc-libde265), [binary packages](https://launchpad.net/~strukturag/+archive/libde265). - Windows DirectShow filters, https://github.com/strukturag/LAVFilters/releases - ffmpeg fork, https://github.com/farindk/ffmpeg - ffmpeg decoder [source](https://github.com/strukturag/libde265-ffmpeg) - libde265.js JavaScript decoder [source](https://github.com/strukturag/libde265.js), [demo](https://strukturag.github.io/libde265.js/). ## Packaging status [![libde265 packaging status](https://repology.org/badge/vertical-allrepos/libde265.svg?exclude_unsupported=1&columns=3&exclude_sources=modules,site&header=libde265%20packaging%20status)](https://repology.org/project/libde265/versions) License ======= The library `libde265` is distributed under the terms of the GNU Lesser General Public License. The sample applications are distributed under the terms of the MIT license. See `COPYING` for more details. 
The short video clip in the 'testdata' directory is from the movie 'Girl Shy', which is in the public domain. Copyright (c) 2013-2014 Struktur AG
Copyright (c) 2013-2026 Dirk Farin
Contact: Dirk Farin libde265-1.0.18/cmake/000077500000000000000000000000001515675107500142115ustar00rootroot00000000000000libde265-1.0.18/cmake/config.h.in000066400000000000000000000002641515675107500162360ustar00rootroot00000000000000/* config.h.in - generated by cmake */ #cmakedefine HAVE_MALLOC_H 1 #cmakedefine HAVE_POSIX_MEMALIGN 1 #cmakedefine HAVE_SSE4_1 1 #cmakedefine HAVE_ARM 1 #cmakedefine HAVE_NEON 1 libde265-1.0.18/cmake/toolchains/000077500000000000000000000000001515675107500163545ustar00rootroot00000000000000libde265-1.0.18/cmake/toolchains/arm-linux-gnueabihf.cmake000066400000000000000000000005251515675107500232220ustar00rootroot00000000000000set(CMAKE_SYSTEM_NAME Linux) set(CMAKE_SYSTEM_PROCESSOR arm) set(CMAKE_C_COMPILER arm-linux-gnueabihf-gcc) set(CMAKE_CXX_COMPILER arm-linux-gnueabihf-g++) set(CMAKE_FIND_ROOT_PATH /usr/arm-linux-gnueabihf) set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) libde265-1.0.18/cmake/toolchains/mingw-i686.cmake000066400000000000000000000006321515675107500211720ustar00rootroot00000000000000set(CMAKE_SYSTEM_NAME Windows) set(CMAKE_SYSTEM_PROCESSOR x86) set(CMAKE_C_COMPILER i686-w64-mingw32-gcc-posix) set(CMAKE_CXX_COMPILER i686-w64-mingw32-g++-posix) set(CMAKE_FIND_ROOT_PATH /usr/i686-w64-mingw32) set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++") libde265-1.0.18/cmake/toolchains/mingw-x86_64.cmake000066400000000000000000000006431515675107500214360ustar00rootroot00000000000000set(CMAKE_SYSTEM_NAME Windows) set(CMAKE_SYSTEM_PROCESSOR x86_64) set(CMAKE_C_COMPILER x86_64-w64-mingw32-gcc-posix) set(CMAKE_CXX_COMPILER x86_64-w64-mingw32-g++-posix) set(CMAKE_FIND_ROOT_PATH /usr/x86_64-w64-mingw32) set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) 
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++") libde265-1.0.18/dec265/000077500000000000000000000000001515675107500141215ustar00rootroot00000000000000libde265-1.0.18/dec265/CMakeLists.txt000066400000000000000000000013711515675107500166630ustar00rootroot00000000000000add_executable (dec265 dec265.cc) target_link_libraries (dec265 PRIVATE de265) if(SDL2_FOUND) target_sources(dec265 PRIVATE sdl-display.cc) target_compile_definitions(dec265 PRIVATE HAVE_SDL) target_include_directories (dec265 PRIVATE "${SDL2_INCLUDE_DIRS}") target_link_libraries (dec265 PRIVATE ${SDL2_LIBRARIES}) endif() if(MSVC) target_sources(dec265 PRIVATE ../extra/getopt.c ../extra/getopt_long.c ) endif() install (TARGETS dec265 DESTINATION ${CMAKE_INSTALL_BINDIR}) #if(NOT MSVC) # # hdrcopy uses internal APIs that are not available when compiled for Windows # add_executable (hdrcopy hdrcopy.cc) # # target_link_libraries (hdrcopy PRIVATE de265) # # install (TARGETS hdrcopy DESTINATION ${CMAKE_INSTALL_BINDIR}) #endif() libde265-1.0.18/dec265/COPYING000066400000000000000000000021061515675107500151530ustar00rootroot00000000000000 MIT License Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. libde265-1.0.18/dec265/dec265.cc000066400000000000000000000550561515675107500154330ustar00rootroot00000000000000/* libde265 example application "dec265". MIT License Copyright (c) 2013-2014 struktur AG, Dirk Farin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #define DO_MEMORY_LOGGING 0 #include "de265.h" #include #include #include #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #ifdef HAVE_MALLOC_H #include #endif #ifndef _MSC_VER #include #include #endif #include "libde265/quality.h" #if HAVE_VIDEOGFX #include using namespace videogfx; #endif #if HAVE_SDL #include "sdl-display.h" #endif #ifndef PRIu32 #define PRIu32 "u" #endif #define BUFFER_SIZE 40960 #define NUM_THREADS 4 const uint32_t kSecurityLimit_MaxNALSize = 100 * 1024 * 1024; // 100 MB int nThreads=0; bool nal_input=false; int quiet=0; bool check_hash=false; bool show_help=false; bool dump_headers=false; bool write_yuv=false; bool output_with_videogfx=false; bool logging=true; bool no_acceleration=false; const char *output_filename = "out.yuv"; uint32_t max_frames=UINT32_MAX; bool write_bytestream=false; const char *bytestream_filename; bool measure_quality=false; bool show_ssim_map=false; bool show_psnr_map=false; const char* reference_filename; FILE* reference_file; int highestTID = 100; int verbosity=0; int disable_deblocking=0; int disable_sao=0; static struct option long_options[] = { {"quiet", no_argument, 0, 'q' }, {"threads", required_argument, 0, 't' }, {"check-hash", no_argument, 0, 'c' }, {"profile", no_argument, 0, 'p' }, {"frames", required_argument, 0, 'f' }, {"output", required_argument, 0, 'o' }, {"dump", no_argument, 0, 'd' }, {"nal", no_argument, 0, 'n' }, {"videogfx", no_argument, 0, 'V' }, {"no-logging", no_argument, 0, 'L' }, {"help", no_argument, 0, 'h' }, {"noaccel", no_argument, 0, '0' }, {"write-bytestream", required_argument,0, 'B' }, {"measure", required_argument, 0, 'm' }, {"ssim", no_argument, 0, 's' }, {"errmap", no_argument, 0, 'e' }, {"highest-TID", required_argument, 0, 'T' }, {"verbose", no_argument, 0, 'v' }, {"disable-deblocking", no_argument, &disable_deblocking, 1 }, {"disable-sao", no_argument, &disable_sao, 1 }, {0, 0, 0, 0 } }; static void write_picture(const de265_image* 
img) { static FILE* fh = NULL; if (fh==NULL) { if (strcmp(output_filename, "-") == 0) { fh = stdout; } else { fh = fopen(output_filename, "wb"); } } for (int c=0;c<3;c++) { int stride; const uint8_t* p = de265_get_image_plane(img, c, &stride); int width = de265_get_image_width(img,c); if (de265_get_bits_per_pixel(img,c)<=8) { // --- save 8 bit YUV --- for (int y=0;y> 8; } fwrite(buf, width*2, 1, fh); } delete[] buf; } } fflush(fh); } #if HAVE_VIDEOGFX void display_image(const struct de265_image* img) { static X11Win win; // display picture static bool first=true; if (first) { first=false; win.Create(de265_get_image_width(img,0), de265_get_image_height(img,0), "de265 output"); } int width = de265_get_image_width(img,0); int height = de265_get_image_height(img,0); de265_chroma chroma = de265_get_chroma_format(img); ChromaFormat vgfx_chroma; Colorspace vgfx_cs = Colorspace_YUV; switch (chroma) { case de265_chroma_420: vgfx_chroma = Chroma_420; break; case de265_chroma_422: vgfx_chroma = Chroma_422; break; case de265_chroma_444: vgfx_chroma = Chroma_444; break; case de265_chroma_mono: vgfx_cs = Colorspace_Greyscale; break; } Image visu; visu.Create(width, height, vgfx_cs, vgfx_chroma); int nChannels = 3; if (chroma == de265_chroma_mono) { nChannels = 1; } for (int ch=0;ch> (bit_depth-8); } } } } win.Display(visu); win.WaitForKeypress(); } #endif #if HAVE_SDL static uint8_t* convert_to_8bit(const uint8_t* data, int width, int height, int pixelsPerLine, int bit_depth) { const uint16_t* data16 = (const uint16_t*)data; uint8_t* out = new uint8_t[pixelsPerLine*height]; for (int y=0;y> (bit_depth-8); } } return out; } SDL_YUV_Display sdlWin; bool sdl_active=false; bool display_sdl(const struct de265_image* img) { int width = de265_get_image_width(img,0); int height = de265_get_image_height(img,0); int chroma_width = de265_get_image_width(img,1); int chroma_height = de265_get_image_height(img,1); de265_chroma chroma = de265_get_chroma_format(img); if (!sdl_active) { 
sdl_active=true; enum SDL_YUV_Display::SDL_Chroma sdlChroma; switch (chroma) { case de265_chroma_420: sdlChroma = SDL_YUV_Display::SDL_CHROMA_420; break; case de265_chroma_422: sdlChroma = SDL_YUV_Display::SDL_CHROMA_422; break; case de265_chroma_444: sdlChroma = SDL_YUV_Display::SDL_CHROMA_444; break; case de265_chroma_mono: sdlChroma = SDL_YUV_Display::SDL_CHROMA_MONO; break; default: assert(false); sdlChroma = SDL_YUV_Display::SDL_CHROMA_MONO; } sdlWin.init(width,height, sdlChroma); } int stride,chroma_stride; const uint8_t* y = de265_get_image_plane(img,0,&stride); const uint8_t* cb =de265_get_image_plane(img,1,&chroma_stride); const uint8_t* cr =de265_get_image_plane(img,2,NULL); int bpp_y = (de265_get_bits_per_pixel(img,0)+7)/8; int bpp_c = (de265_get_bits_per_pixel(img,1)+7)/8; int ppl_y = stride/bpp_y; int ppl_c = chroma_stride/bpp_c; uint8_t* y16 = NULL; uint8_t* cb16 = NULL; uint8_t* cr16 = NULL; int bd; if ((bd=de265_get_bits_per_pixel(img, 0)) > 8) { y16 = convert_to_8bit(y, width,height,ppl_y,bd); y=y16; } if (chroma != de265_chroma_mono) { if ((bd=de265_get_bits_per_pixel(img, 1)) > 8) { cb16 = convert_to_8bit(cb, chroma_width,chroma_height,ppl_c,bd); cb=cb16; } if ((bd=de265_get_bits_per_pixel(img, 2)) > 8) { cr16 = convert_to_8bit(cr, chroma_width,chroma_height,ppl_c,bd); cr=cr16; } } sdlWin.display(y,cb,cr, ppl_y, ppl_c); delete[] y16; delete[] cb16; delete[] cr16; return sdlWin.doQuit(); } #endif static int width,height; static uint32_t framecnt=0; bool output_image(const de265_image* img) { bool stop=false; width = de265_get_image_width(img,0); height = de265_get_image_height(img,0); framecnt++; //printf("SHOW POC: %d / PTS: %ld / integrity: %d\n",img->PicOrderCntVal, img->pts, img->integrity); if (0) { const char* nal_unit_name; int nuh_layer_id; int nuh_temporal_id; de265_get_image_NAL_header(img, NULL, &nal_unit_name, &nuh_layer_id, &nuh_temporal_id); printf("NAL: %s layer:%d temporal:%d\n",nal_unit_name, nuh_layer_id, nuh_temporal_id); } if 
(!quiet) { #if HAVE_SDL && HAVE_VIDEOGFX if (output_with_videogfx) { display_image(img); } else { stop = display_sdl(img); } #elif HAVE_SDL stop = display_sdl(img); #elif HAVE_VIDEOGFX display_image(img); #endif } if (write_yuv) { write_picture(img); } if ((framecnt%100)==0) { fprintf(stderr,"frame %d\r",framecnt); } if (framecnt>=max_frames) { stop=true; } return stop; } static double mse_y=0.0, mse_cb=0.0, mse_cr=0.0; static int mse_frames=0; static double ssim_y=0.0; static int ssim_frames=0; void measure(const de265_image* img) { // --- compute PSNR --- int width = de265_get_image_width(img,0); int height = de265_get_image_height(img,0); uint8_t* p = (uint8_t*)malloc(width*height*3/2); if (p == NULL) { return; } size_t toread = width*height*3/2; if (fread(p,1,toread,reference_file) != toread) { free(p); return; } int stride, cstride; const uint8_t* yptr = de265_get_image_plane(img,0, &stride); const uint8_t* cbptr = de265_get_image_plane(img,1, &cstride); const uint8_t* crptr = de265_get_image_plane(img,2, &cstride); double img_mse_y = MSE( yptr, stride, p, width, width, height); double img_mse_cb = MSE(cbptr, cstride, p+width*height, width/2, width/2,height/2); double img_mse_cr = MSE(crptr, cstride, p+width*height*5/4, width/2, width/2,height/2); mse_frames++; mse_y += img_mse_y; mse_cb += img_mse_cb; mse_cr += img_mse_cr; // --- compute SSIM --- double ssimSum = 0.0; #if HAVE_VIDEOGFX Bitmap ref, coded; ref .Create(width, height); // reference image coded.Create(width, height); // coded image const uint8_t* data; data = de265_get_image_plane(img,0,&stride); for (int y=0;y ssim = ssimAlgo.calcSSIM(ref,coded); Bitmap ssimMap; ssimMap.Create(width,height); for (int y=0;y error_map = CalcErrorMap(ref, coded, TransferCurve_Sqrt); // display PSNR error map if (show_psnr_map) { static X11Win win; static bool first=true; if (first) { first=false; win.Create(de265_get_image_width(img,0), de265_get_image_height(img,0), "psnr output"); } 
win.Display(MakeImage(error_map)); } // display SSIM error map if (show_ssim_map) { static X11Win win; static bool first=true; if (first) { first=false; win.Create(de265_get_image_width(img,0), de265_get_image_height(img,0), "ssim output"); } win.Display(MakeImage(ssimMap)); } #endif ssim_frames++; ssim_y += ssimSum; printf("%5d %6f %6f %6f %6f\n", framecnt, PSNR(img_mse_y), PSNR(img_mse_cb), PSNR(img_mse_cr), ssimSum); free(p); } #ifdef WIN32 #include #define WIN32_LEAN_AND_MEAN #include int gettimeofday(struct timeval *tp, void *) { time_t clock; struct tm tm; SYSTEMTIME wtm; GetLocalTime(&wtm); tm.tm_year = wtm.wYear - 1900; tm.tm_mon = wtm.wMonth - 1; tm.tm_mday = wtm.wDay; tm.tm_hour = wtm.wHour; tm.tm_min = wtm.wMinute; tm.tm_sec = wtm.wSecond; tm. tm_isdst = -1; clock = mktime(&tm); tp->tv_sec = (long) clock; tp->tv_usec = wtm.wMilliseconds * 1000; return (0); } #endif #ifdef HAVE___MALLOC_HOOK #ifdef __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" #endif static void *(*old_malloc_hook)(size_t, const void *); static void *new_malloc_hook(size_t size, const void *caller) { void *mem; /* if (size>1000000) { raise(SIGINT); } */ __malloc_hook = old_malloc_hook; mem = malloc(size); fprintf(stderr, "%p: malloc(%zu) = %p\n", caller, size, mem); __malloc_hook = new_malloc_hook; return mem; } static void init_my_hooks(void) { old_malloc_hook = __malloc_hook; __malloc_hook = new_malloc_hook; } #if DO_MEMORY_LOGGING void (*volatile __malloc_initialize_hook)(void) = init_my_hooks; #endif #ifdef __GNUC__ #pragma GCC diagnostic pop #endif #endif int parse_param(const char* arg, std::optional lower_bound, std::optional upper_bound, const char* arg_name) { int value; try { size_t len; value = std::stoi(optarg, &len); if (arg[len] != 0) { std::cerr << "invalid argument to " << arg_name << "\n"; exit(5); } } catch (std::invalid_argument const& ex) { std::cerr << "invalid argument to " << arg_name << "\n"; exit(5); } catch 
(std::out_of_range const& ex) { std::cerr << "argument to -T is out of range\n"; exit(5); } if (lower_bound && value < *lower_bound) { std::cerr << "argument to " << arg_name << " may not be smaller than " << *lower_bound << "\n"; exit(5); } if (upper_bound && value > *upper_bound) { std::cerr << "argument to " << arg_name << " may not be larger than " << *upper_bound << "\n"; exit(5); } return value; } int main(int argc, char** argv) { while (1) { int option_index = 0; int c = getopt_long(argc, argv, "qt:chf:o:dLB:n0vT:m:se" #if HAVE_VIDEOGFX && HAVE_SDL "V" #endif , long_options, &option_index); if (c == -1) break; switch (c) { case 'q': quiet++; break; case 't': nThreads=parse_param(optarg, 0, std::nullopt, "-t"); break; case 'c': check_hash=true; break; case 'f': max_frames=parse_param(optarg, 1, std::nullopt, "-f"); break; case 'o': write_yuv=true; output_filename=optarg; break; case 'h': show_help=true; break; case 'd': dump_headers=true; break; case 'n': nal_input=true; break; case 'V': output_with_videogfx=true; break; case 'L': logging=false; break; case '0': no_acceleration=true; break; case 'B': write_bytestream=true; bytestream_filename=optarg; break; case 'm': measure_quality=true; reference_filename=optarg; break; case 's': show_ssim_map=true; break; case 'e': show_psnr_map=true; break; case 'T': highestTID = parse_param(optarg, 0, std::nullopt, "-T"); break; case 'v': verbosity++; break; } } if (optind != argc-1 || show_help) { fprintf(stderr," dec265 v%s\n", de265_get_version()); fprintf(stderr,"-----------------\n"); fprintf(stderr,"usage: dec265 [options] videofile.bin\n"); fprintf(stderr,"The video file must be a raw bitstream, or a stream with NAL units (option -n).\n"); fprintf(stderr,"\n"); fprintf(stderr,"options:\n"); fprintf(stderr," -q, --quiet do not show decoded image\n"); fprintf(stderr," -t, --threads N set number of worker threads (0 - no threading)\n"); fprintf(stderr," -c, --check-hash perform hash check\n"); fprintf(stderr," -n, 
--nal input is a stream with 4-byte length prefixed NAL units\n"); fprintf(stderr," -f, --frames N set number of frames to process\n"); fprintf(stderr," -o, --output write YUV reconstruction\n"); fprintf(stderr," -d, --dump dump headers\n"); #if HAVE_VIDEOGFX && HAVE_SDL fprintf(stderr," -V, --videogfx output with videogfx instead of SDL\n"); #endif fprintf(stderr," -0, --noaccel do not use any accelerated code (SSE)\n"); fprintf(stderr," -v, --verbose increase verbosity level (up to 3 times)\n"); fprintf(stderr," -L, --no-logging disable logging\n"); fprintf(stderr," -B, --write-bytestream FILENAME write raw bytestream (from NAL input)\n"); fprintf(stderr," -m, --measure YUV compute PSNRs relative to reference YUV\n"); #if HAVE_VIDEOGFX fprintf(stderr," -s, --ssim show SSIM-map (only when -m active)\n"); fprintf(stderr," -e, --errmap show error-map (only when -m active)\n"); #endif fprintf(stderr," -T, --highest-TID select highest temporal sublayer to decode\n"); fprintf(stderr," --disable-deblocking disable deblocking filter\n"); fprintf(stderr," --disable-sao disable sample-adaptive offset filter\n"); fprintf(stderr," -h, --help show help\n"); exit(show_help ? 
0 : 5); } de265_error err =DE265_OK; de265_decoder_context* ctx = de265_new_decoder(); de265_set_parameter_bool(ctx, DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH, check_hash); de265_set_parameter_bool(ctx, DE265_DECODER_PARAM_SUPPRESS_FAULTY_PICTURES, false); de265_set_parameter_bool(ctx, DE265_DECODER_PARAM_DISABLE_DEBLOCKING, disable_deblocking); de265_set_parameter_bool(ctx, DE265_DECODER_PARAM_DISABLE_SAO, disable_sao); if (dump_headers) { de265_set_parameter_int(ctx, DE265_DECODER_PARAM_DUMP_SPS_HEADERS, 1); de265_set_parameter_int(ctx, DE265_DECODER_PARAM_DUMP_VPS_HEADERS, 1); de265_set_parameter_int(ctx, DE265_DECODER_PARAM_DUMP_PPS_HEADERS, 1); de265_set_parameter_int(ctx, DE265_DECODER_PARAM_DUMP_SLICE_HEADERS, 1); } if (no_acceleration) { de265_set_parameter_int(ctx, DE265_DECODER_PARAM_ACCELERATION_CODE, de265_acceleration_SCALAR); } if (!logging) { de265_disable_logging(); } de265_set_verbosity(verbosity); if (argc>=3) { if (nThreads>0) { err = de265_start_worker_threads(ctx, nThreads); } } de265_set_limit_TID(ctx, highestTID); if (measure_quality) { reference_file = fopen(reference_filename, "rb"); if (reference_file == nullptr) { fprintf(stderr, "Error: cannot create measurement output file '%s'\n", reference_filename); exit(5); } } FILE* fh; if (strcmp(argv[optind],"-")==0) { fh = stdin; } else { fh = fopen(argv[optind], "rb"); } if (fh==NULL) { fprintf(stderr,"cannot open file %s!\n", argv[optind]); exit(10); } FILE* bytestream_fh = NULL; if (write_bytestream) { bytestream_fh = fopen(bytestream_filename, "wb"); } bool stop=false; struct timeval tv_start; gettimeofday(&tv_start, NULL); int pos=0; while (!stop) { //tid = (framecnt/1000) & 1; //de265_set_limit_TID(ctx, tid); if (nal_input) { uint8_t len[4]; int n = fread(len,1,4,fh); uint32_t length = (len[0]<<24) + (len[1]<<16) + (len[2]<<8) + len[3]; if (length > kSecurityLimit_MaxNALSize) { fprintf(stderr, "NAL packet with size %" PRIu32 " exceeds security limit %" PRIu32 ", skipping this NAL.\n", 
length, kSecurityLimit_MaxNALSize); fseek(fh, length, SEEK_CUR); pos += length; } else { uint8_t* buf = (uint8_t*)malloc(length); n = fread(buf,1,length,fh); err = de265_push_NAL(ctx, buf,n, pos, (void*)1); if (write_bytestream) { uint8_t sc[3] = { 0,0,1 }; fwrite(sc ,1,3,bytestream_fh); fwrite(buf,1,n,bytestream_fh); } free(buf); pos+=n; } } else { // read a chunk of input data uint8_t buf[BUFFER_SIZE]; int n = fread(buf,1,BUFFER_SIZE,fh); // decode input data if (n) { err = de265_push_data(ctx, buf, n, pos, (void*)2); if (err != DE265_OK) { break; } } pos+=n; if (0) { // fake skipping if (pos>1000000) { printf("RESET\n"); de265_reset(ctx); pos=0; fseek(fh,-200000,SEEK_CUR); } } } // printf("pending data: %d\n", de265_get_number_of_input_bytes_pending(ctx)); if (feof(fh)) { err = de265_flush_data(ctx); // indicate end of stream stop = true; } // decoding / display loop int more=1; while (more) { more = 0; // decode some more err = de265_decode(ctx, &more); if (err != DE265_OK) { // if (quiet<=1) fprintf(stderr,"ERROR: %s\n", de265_get_error_text(err)); if (check_hash && err == DE265_ERROR_CHECKSUM_MISMATCH) stop = 1; more = 0; break; } // show available images const de265_image* img = de265_get_next_picture(ctx); if (img) { if (measure_quality) { measure(img); } stop = output_image(img); if (stop) more=0; else more=1; } // show warnings for (;;) { de265_error warning = de265_get_warning(ctx); if (warning==DE265_OK) { break; } if (quiet<=1) fprintf(stderr,"WARNING: %s\n", de265_get_error_text(warning)); } } } fclose(fh); if (write_bytestream) { fclose(bytestream_fh); } if (measure_quality) { printf("#total %6f %6f %6f %6f\n", PSNR(mse_y /mse_frames), PSNR(mse_cb/mse_frames), PSNR(mse_cr/mse_frames), ssim_y/ssim_frames); fclose(reference_file); } de265_free_decoder(ctx); struct timeval tv_end; gettimeofday(&tv_end, NULL); if (err != DE265_OK) { if (quiet<=1) fprintf(stderr,"decoding error: %s (code=%d)\n", de265_get_error_text(err), err); } double secs = 
tv_end.tv_sec-tv_start.tv_sec; secs += (tv_end.tv_usec - tv_start.tv_usec)*0.001*0.001; if (quiet<=1) fprintf(stderr,"nFrames decoded: %d (%dx%d @ %5.2f fps)\n",framecnt, width,height,framecnt/secs); return err==DE265_OK ? 0 : 10; } libde265-1.0.18/dec265/hdrcopy.cc000066400000000000000000000062711515675107500161060ustar00rootroot00000000000000/* libde265 example application. MIT License Copyright (c) 2013-2014 struktur AG, Dirk Farin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include "libde265/nal-parser.h" #include "libde265/decctx.h" #include error_queue errqueue; video_parameter_set vps; seq_parameter_set sps; pic_parameter_set pps; CABAC_encoder_bitstream writer; void process_nal(NAL_unit* nal) { de265_error err = DE265_OK; bitreader reader; bitreader_init(&reader, nal->data(), nal->size()); nal_header nal_hdr; nal_hdr.read(&reader); writer.write_startcode(); nal_hdr.write(writer); printf("NAL: 0x%x 0x%x - unit type:%s temporal id:%d\n", nal->data()[0], nal->data()[1], get_NAL_name(nal_hdr.nal_unit_type), nal_hdr.nuh_temporal_id); if (nal_hdr.nal_unit_type<32) { //err = read_slice_NAL(reader, nal, nal_hdr); } else switch (nal_hdr.nal_unit_type) { case NAL_UNIT_VPS_NUT: vps.read(&errqueue, &reader); vps.dump(1); vps.write(&errqueue, writer); writer.flush_VLC(); break; case NAL_UNIT_SPS_NUT: sps.read(&errqueue, &reader); sps.dump(1); sps.write(&errqueue, writer); writer.flush_VLC(); break; case NAL_UNIT_PPS_NUT: //err = read_pps_NAL(reader); break; case NAL_UNIT_PREFIX_SEI_NUT: case NAL_UNIT_SUFFIX_SEI_NUT: //err = read_sei_NAL(reader, nal_hdr.nal_unit_type==NAL_UNIT_SUFFIX_SEI_NUT); break; case NAL_UNIT_EOS_NUT: //ctx->FirstAfterEndOfSequenceNAL = true; break; } } int main(int argc, char** argv) { NAL_Parser nal_parser; FILE* fh = fopen(argv[1],"rb"); unsigned char buf[1024]; writer.write_bits(0,8); // because HM has an extra byte at the beginning while(!feof(fh)) { int n = fread(buf,1,1024,fh); if (n>0) { nal_parser.push_data(buf,n, 0); } if (nal_parser.get_NAL_queue_length()>0) { NAL_unit* nal = nal_parser.pop_from_NAL_queue(); assert(nal); process_nal(nal); nal_parser.free_NAL_unit(nal); } } fclose(fh); fh = fopen("out.bin","wb"); fwrite(writer.data(), 1,writer.size(), fh); fclose(fh); return 0; } libde265-1.0.18/dec265/sdl-display.cc000066400000000000000000000204641515675107500166630ustar00rootroot00000000000000/* This file is part of dec265, an example application using libde265. 
MIT License Copyright (c) 2013-2014 struktur AG, Dirk Farin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "sdl-display.h" #include bool SDL_YUV_Display::init(int frame_width, int frame_height, enum SDL_Chroma chroma) { // reduce image size to a multiple of 8 (apparently required by YUV overlay) frame_width &= ~7; frame_height &= ~7; mChroma = chroma; if (SDL_Init(SDL_INIT_VIDEO) < 0 ) { printf("SDL_Init() failed: %s\n", SDL_GetError( ) ); SDL_Quit(); return false; } // set window title const char *window_title = "SDL YUV display"; mWindow = SDL_CreateWindow(window_title, SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, frame_width, frame_height, 0); if (!mWindow) { printf("SDL: Couldn't set video mode to %dx%d: %s\n", frame_width, frame_height, SDL_GetError()); SDL_Quit(); return false; } Uint32 flags = 0; // Empty flags prioritize SDL_RENDERER_ACCELERATED. 
mRenderer = SDL_CreateRenderer(mWindow, -1, flags); if (!mRenderer) { printf("SDL: Couldn't create renderer: %s\n", SDL_GetError()); SDL_Quit(); return false; } Uint32 pixelFormat = 0; switch (mChroma) { case SDL_CHROMA_MONO: pixelFormat = SDL_PIXELFORMAT_YV12; break; case SDL_CHROMA_420: pixelFormat = SDL_PIXELFORMAT_YV12; break; case SDL_CHROMA_422: pixelFormat = SDL_PIXELFORMAT_YV12; break; case SDL_CHROMA_444: pixelFormat = SDL_PIXELFORMAT_YV12; break; //case SDL_CHROMA_444: pixelFormat = SDL_PIXELFORMAT_YV12; break; default: printf("Unsupported chroma: %d\n", mChroma); SDL_Quit(); return false; } mTexture = SDL_CreateTexture(mRenderer, pixelFormat, SDL_TEXTUREACCESS_STREAMING, frame_width, frame_height); if (!mTexture ) { printf("SDL: Couldn't create SDL texture: %s\n", SDL_GetError()); SDL_Quit(); return false; } rect.x = 0; rect.y = 0; rect.w = frame_width; rect.h = frame_height; mWindowOpen=true; return true; } void SDL_YUV_Display::display(const unsigned char *Y, const unsigned char *U, const unsigned char *V, int stride, int chroma_stride) { if (!mWindowOpen) return; if (SDL_LockTexture(mTexture, nullptr, reinterpret_cast(&mPixels), &mStride) < 0) return; if (mChroma == SDL_CHROMA_420) { display420(Y,U,V,stride,chroma_stride); } else if (mChroma == SDL_CHROMA_422) { display422(Y,U,V,stride,chroma_stride); } else if (mChroma == SDL_CHROMA_444) { display444as420(Y,U,V,stride,chroma_stride); //display444as422(Y,U,V,stride,chroma_stride); } else if (mChroma == SDL_CHROMA_MONO) { display400(Y,stride); } SDL_UnlockTexture(mTexture); SDL_RenderCopy(mRenderer, mTexture, nullptr, nullptr); SDL_RenderPresent(mRenderer); } void SDL_YUV_Display::display420(const unsigned char *Y, const unsigned char *U, const unsigned char *V, int stride, int chroma_stride) { if (stride == mStride && chroma_stride == mStride/2) { // fast copy memcpy(mPixels, Y, rect.w * rect.h); memcpy(&mPixels[rect.w * rect.h], V, rect.w * rect.h / 4); memcpy(&mPixels[(rect.w * rect.h) + (rect.w * 
rect.h / 4)], U, rect.w * rect.h / 4); } else { // copy line by line, because sizes are different uint8_t *dest = mPixels; for (int y=0;y Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include class SDL_YUV_Display { public: enum SDL_Chroma { SDL_CHROMA_MONO=400, SDL_CHROMA_420 =420, SDL_CHROMA_422 =422, SDL_CHROMA_444 =444 }; bool init(int frame_width, int frame_height, enum SDL_Chroma chroma = SDL_CHROMA_420); void display(const unsigned char *Y, const unsigned char *U, const unsigned char *V, int stride, int chroma_stride); void close(); bool doQuit() const; bool isOpen() const { return mWindowOpen; } private: SDL_Window *mWindow = nullptr; SDL_Renderer *mRenderer = nullptr; SDL_Texture *mTexture = nullptr; SDL_Rect rect; bool mWindowOpen; uint8_t *mPixels = nullptr; int mStride = 0; SDL_Chroma mChroma; void display400(const unsigned char *Y, int stride); void display420(const unsigned char *Y, const unsigned char *U, const unsigned char *V, int stride, int chroma_stride); void display422(const unsigned char *Y, const unsigned char *U, const unsigned char *V, int stride, int chroma_stride); void display444as422(const unsigned char *Y, const unsigned char *U, const unsigned char *V, int stride, int chroma_stride); void display444as420(const unsigned char *Y, const unsigned char *U, const unsigned char *V, int stride, int chroma_stride); }; libde265-1.0.18/extra/000077500000000000000000000000001515675107500142545ustar00rootroot00000000000000libde265-1.0.18/extra/getopt.c000066400000000000000000000106031515675107500157220ustar00rootroot00000000000000/* $NetBSD: getopt.c,v 1.16 1999/12/02 13:15:56 kleink Exp $ */ /* * Copyright (c) 1987, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #if 0 static char sccsid[] = "@(#)getopt.c 8.3 (Berkeley) 4/27/95"; #endif #include #include #include #include #define __P(x) x #define _DIAGASSERT(x) assert(x) #ifdef __weak_alias __weak_alias(getopt,_getopt); #endif #ifdef __cplusplus extern "C" { #endif int opterr = 1, /* if error message should be printed */ optind = 1, /* index into parent argv vector */ optopt, /* character checked for validity */ optreset; /* reset getopt */ char *optarg; /* argument associated with option */ static char * _progname __P((char *)); int getopt_internal __P((int, char * const *, const char *)); static char * _progname(char * nargv0) { char * tmp; _DIAGASSERT(nargv0 != NULL); tmp = strrchr(nargv0, '/'); if (tmp) tmp++; else tmp = nargv0; return(tmp); } #define BADCH (int)'?' #define BADARG (int)':' #define EMSG "" /* * getopt -- * Parse argc/argv argument vector. */ int getopt(int nargc, char * nargv[], const char *ostr) { static char *__progname = 0; static char *place = EMSG; /* option letter processing */ const char *oli; /* option letter list index */ __progname = __progname?__progname:_progname(*nargv); _DIAGASSERT(nargv != NULL); _DIAGASSERT(ostr != NULL); if (optreset || !*place) { /* update scanning pointer */ optreset = 0; if (optind >= nargc || *(place = nargv[optind]) != '-') { place = EMSG; return (-1); } if (place[1] && *++place == '-' /* found "--" */ && place[1] == '\0') { ++optind; place = EMSG; return (-1); } } /* option letter okay? */ if ((optopt = (int)*place++) == (int)':' || !(oli = strchr(ostr, optopt))) { /* * if the user didn't specify '-' as an option, * assume it means -1. 
*/ if (optopt == (int)'-') return (-1); if (!*place) ++optind; if (opterr && *ostr != ':') (void)fprintf(stderr, "%s: illegal option -- %c\n", __progname, optopt); return (BADCH); } if (*++oli != ':') { /* don't need argument */ optarg = NULL; if (!*place) ++optind; } else { /* need an argument */ if (*place) /* no white space */ optarg = place; else if (nargc <= ++optind) { /* no arg */ place = EMSG; if (*ostr == ':') return (BADARG); if (opterr) (void)fprintf(stderr, "%s: option requires an argument -- %c\n", __progname, optopt); return (BADCH); } else /* white space */ optarg = nargv[optind]; place = EMSG; ++optind; } return (optopt); /* dump back option letter */ } #ifdef __cplusplus } #endif libde265-1.0.18/extra/getopt.h000066400000000000000000000047211515675107500157330ustar00rootroot00000000000000/* * Copyright (c) 1987, 1993, 1994, 1996 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef __GETOPT_H__ #define __GETOPT_H__ #ifdef __cplusplus extern "C" { #endif extern int opterr; /* if error message should be printed */ extern int optind; /* index into parent argv vector */ extern int optopt; /* character checked for validity */ extern int optreset; /* reset getopt */ extern char *optarg; /* argument associated with option */ struct option { const char *name; int has_arg; int *flag; int val; }; #define no_argument 0 #define required_argument 1 #define optional_argument 2 int getopt(int, char**, char*); int getopt_long(int, char**, char*, struct option*, int*); #ifdef __cplusplus } #endif #endif /* __GETOPT_H__ */ libde265-1.0.18/extra/getopt_long.c000066400000000000000000000144011515675107500167410ustar00rootroot00000000000000/* * Copyright (c) 1987, 1993, 1994, 1996 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #include #include #include #include "getopt.h" extern int opterr; /* if error message should be printed */ extern int optind; /* index into parent argv vector */ extern int optopt; /* character checked for validity */ extern int optreset; /* reset getopt */ extern char *optarg; /* argument associated with option */ #define __P(x) x #define _DIAGASSERT(x) assert(x) static char * __progname __P((char *)); int getopt_internal __P((int, char * const *, const char *)); static char * __progname(char * nargv0) { char * tmp; _DIAGASSERT(nargv0 != NULL); tmp = strrchr(nargv0, '/'); if (tmp) tmp++; else tmp = nargv0; return(tmp); } #define BADCH (int)'?' #define BADARG (int)':' #define EMSG "" /* * getopt -- * Parse argc/argv argument vector. */ int getopt_internal(int nargc, char ** nargv, const char *ostr) { static char *place = EMSG; /* option letter processing */ const char *oli; /* option letter list index */ _DIAGASSERT(nargv != NULL); _DIAGASSERT(ostr != NULL); if (optreset || !*place) { /* update scanning pointer */ optreset = 0; if (optind >= nargc || *(place = nargv[optind]) != '-') { place = EMSG; return (-1); } if (place[1] && *++place == '-') { /* found "--" */ /* ++optind; */ place = EMSG; return (-2); } } /* option letter okay? */ if ((optopt = (int)*place++) == (int)':' || !(oli = strchr(ostr, optopt))) { /* * if the user didn't specify '-' as an option, * assume it means -1. 
*/ if (optopt == (int)'-') return (-1); if (!*place) ++optind; if (opterr && *ostr != ':') (void)fprintf(stderr, "%s: illegal option -- %c\n", __progname(nargv[0]), optopt); return (BADCH); } if (*++oli != ':') { /* don't need argument */ optarg = NULL; if (!*place) ++optind; } else { /* need an argument */ if (*place) /* no white space */ optarg = place; else if (nargc <= ++optind) { /* no arg */ place = EMSG; if ((opterr) && (*ostr != ':')) (void)fprintf(stderr, "%s: option requires an argument -- %c\n", __progname(nargv[0]), optopt); return (BADARG); } else /* white space */ optarg = nargv[optind]; place = EMSG; ++optind; } return (optopt); /* dump back option letter */ } #if 0 /* * getopt -- * Parse argc/argv argument vector. */ int getopt2(int nargc, char * nargv, const char *ostr) { int retval; if ((retval = getopt_internal(nargc, nargv, ostr)) == -2) { retval = -1; ++optind; } return(retval); } #endif /* * getopt_long -- * Parse argc/argv argument vector. */ int getopt_long(int nargc, char ** nargv, char * options, struct option * long_options, int * index) { int retval; _DIAGASSERT(nargv != NULL); _DIAGASSERT(options != NULL); _DIAGASSERT(long_options != NULL); /* index may be NULL */ if ((retval = getopt_internal(nargc, nargv, options)) == -2) { char *current_argv = nargv[optind++] + 2, *has_equal; int i, current_argv_len, match = -1; if (*current_argv == '\0') { return(-1); } if ((has_equal = strchr(current_argv, '=')) != NULL) { current_argv_len = has_equal - current_argv; has_equal++; } else current_argv_len = strlen(current_argv); for (i = 0; long_options[i].name; i++) { if (strncmp(current_argv, long_options[i].name, current_argv_len)) continue; if (strlen(long_options[i].name) == (unsigned)current_argv_len) { match = i; break; } if (match == -1) match = i; } if (match != -1) { if (long_options[match].has_arg == required_argument || long_options[match].has_arg == optional_argument) { if (has_equal) optarg = has_equal; else optarg = nargv[optind++]; } 
if ((long_options[match].has_arg == required_argument) && (optarg == NULL)) { /* * Missing argument, leading : * indicates no error should be generated */ if ((opterr) && (*options != ':')) (void)fprintf(stderr, "%s: option requires an argument -- %s\n", __progname(nargv[0]), current_argv); return (BADARG); } } else { /* No matching argument */ if ((opterr) && (*options != ':')) (void)fprintf(stderr, "%s: illegal option -- %s\n", __progname(nargv[0]), current_argv); return (BADCH); } if (long_options[match].flag) { *long_options[match].flag = long_options[match].val; retval = 0; } else retval = long_options[match].val; if (index) *index = match; } return(retval); } libde265-1.0.18/extra/libde265/000077500000000000000000000000001515675107500155705ustar00rootroot00000000000000libde265-1.0.18/extra/libde265/de265-version.h000066400000000000000000000017261515675107500202570ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #ifndef LIBDE265_VERSION_H #define LIBDE265_VERSION_H /* Numeric representation of the version */ #define LIBDE265_NUMERIC_VERSION 0x01001600 #define LIBDE265_VERSION "1.0.16" #endif libde265-1.0.18/extra/win32cond.c000066400000000000000000000110341515675107500162250ustar00rootroot00000000000000/** * pthread_cond API for Win32 * * ACE(TM), TAO(TM), CIAO(TM), DAnCE>(TM), and CoSMIC(TM) (henceforth * referred to as "DOC software") are copyrighted by Douglas C. Schmidt * and his research group at Washington University, University of California, * Irvine, and Vanderbilt University, Copyright (c) 1993-2009, all rights * reserved. * * Since DOC software is open-source, freely available software, you are free * to use, modify, copy, and distribute--perpetually and irrevocably--the DOC * software source code and object code produced from the source, as well as * copy and distribute modified versions of this software. You must, however, * include this copyright statement along with any code built using DOC * software that you release. * * No copyright statement needs to be provided if you just ship binary * executables of your software products. * * See "Strategies for Implementing POSIX Condition Variables on Win32" at * http://www.cs.wustl.edu/~schmidt/win32-cv-1.html */ #include #include "win32cond.h" int win32_cond_init(win32_cond_t *cv) { cv->waiters_count_ = 0; cv->was_broadcast_ = 0; cv->sema_ = CreateSemaphore (NULL, // no security 0, // initially 0 0x7fffffff, // max count NULL); // unnamed InitializeCriticalSection (&cv->waiters_count_lock_); cv->waiters_done_ = CreateEvent (NULL, // no security FALSE, // auto-reset FALSE, // non-signaled initially NULL); // unnamed return 0; } int win32_cond_destroy(win32_cond_t *cv) { CloseHandle(cv->waiters_done_); DeleteCriticalSection(&cv->waiters_count_lock_); CloseHandle(cv->sema_); return 0; } int win32_cond_wait(win32_cond_t *cv, HANDLE *external_mutex) { int last_waiter; // Avoid race conditions. 
EnterCriticalSection (&cv->waiters_count_lock_); cv->waiters_count_++; LeaveCriticalSection (&cv->waiters_count_lock_); // This call atomically releases the mutex and waits on the // semaphore until or // are called by another thread. SignalObjectAndWait (*external_mutex, cv->sema_, INFINITE, FALSE); // Reacquire lock to avoid race conditions. EnterCriticalSection (&cv->waiters_count_lock_); // We're no longer waiting... cv->waiters_count_--; // Check to see if we're the last waiter after . last_waiter = cv->was_broadcast_ && cv->waiters_count_ == 0; LeaveCriticalSection (&cv->waiters_count_lock_); // If we're the last waiter thread during this particular broadcast // then let all the other threads proceed. if (last_waiter) // This call atomically signals the event and waits until // it can acquire the . This is required to ensure fairness. SignalObjectAndWait (cv->waiters_done_, *external_mutex, INFINITE, FALSE); else // Always regain the external mutex since that's the guarantee we // give to our callers. WaitForSingleObject (*external_mutex, INFINITE); return 0; } int win32_cond_signal(win32_cond_t *cv) { int have_waiters; EnterCriticalSection (&cv->waiters_count_lock_); have_waiters = cv->waiters_count_ > 0; LeaveCriticalSection (&cv->waiters_count_lock_); // If there aren't any waiters, then this is a no-op. if (have_waiters) ReleaseSemaphore (cv->sema_, 1, 0); return 0; } int win32_cond_broadcast(win32_cond_t *cv) { int have_waiters = 0; // This is needed to ensure that and are // consistent relative to each other. EnterCriticalSection (&cv->waiters_count_lock_); if (cv->waiters_count_ > 0) { // We are broadcasting, even if there is just one waiter... // Record that we are broadcasting, which helps optimize // for the non-broadcast case. cv->was_broadcast_ = 1; have_waiters = 1; } if (have_waiters) { // Wake up all the waiters atomically. 
ReleaseSemaphore (cv->sema_, cv->waiters_count_, 0); LeaveCriticalSection (&cv->waiters_count_lock_); // Wait for all the awakened threads to acquire the counting // semaphore. WaitForSingleObject (cv->waiters_done_, INFINITE); // This assignment is okay, even without the held // because no other waiter threads can wake up to access it. cv->was_broadcast_ = 0; } else LeaveCriticalSection (&cv->waiters_count_lock_); return 0; } libde265-1.0.18/extra/win32cond.h000066400000000000000000000037221515675107500162370ustar00rootroot00000000000000#ifndef WIN32COND_H #define WIN32COND_H /** * pthread_cond API for Win32 * * ACE(TM), TAO(TM), CIAO(TM), DAnCE>(TM), and CoSMIC(TM) (henceforth * referred to as "DOC software") are copyrighted by Douglas C. Schmidt * and his research group at Washington University, University of California, * Irvine, and Vanderbilt University, Copyright (c) 1993-2009, all rights * reserved. * * Since DOC software is open-source, freely available software, you are free * to use, modify, copy, and distribute--perpetually and irrevocably--the DOC * software source code and object code produced from the source, as well as * copy and distribute modified versions of this software. You must, however, * include this copyright statement along with any code built using DOC * software that you release. * * No copyright statement needs to be provided if you just ship binary * executables of your software products. * * See "Strategies for Implementing POSIX Condition Variables on Win32" at * http://www.cs.wustl.edu/~schmidt/win32-cv-1.html */ #include typedef struct { long waiters_count_; // Number of waiting threads. CRITICAL_SECTION waiters_count_lock_; // Serialize access to . HANDLE sema_; // Semaphore used to queue up threads waiting for the condition to // become signaled. HANDLE waiters_done_; // An auto-reset event used by the broadcast/signal thread to wait // for all the waiting thread(s) to wake up and be released from the // semaphore. 
size_t was_broadcast_; // Keeps track of whether we were broadcasting or signaling. This // allows us to optimize the code if we're just signaling. } win32_cond_t; #ifdef __cplusplus extern "C" { #endif int win32_cond_init(win32_cond_t *cv); int win32_cond_destroy(win32_cond_t *cv); int win32_cond_wait(win32_cond_t *cv, HANDLE *external_mutex); int win32_cond_signal(win32_cond_t *cv); int win32_cond_broadcast(win32_cond_t *cv); #ifdef __cplusplus } #endif #endif libde265-1.0.18/fuzzing/000077500000000000000000000000001515675107500146255ustar00rootroot00000000000000libde265-1.0.18/fuzzing/CMakeLists.txt000066400000000000000000000004671515675107500173740ustar00rootroot00000000000000add_executable(stream_fuzzer stream_fuzzer.cc) target_link_libraries(stream_fuzzer PRIVATE de265) target_link_options(stream_fuzzer PRIVATE "-fsanitize=fuzzer") add_executable(nal_fuzzer nal_fuzzer.cc) target_link_libraries(nal_fuzzer PRIVATE de265) target_link_options(nal_fuzzer PRIVATE "-fsanitize=fuzzer") libde265-1.0.18/fuzzing/nal_fuzzer.cc000066400000000000000000000041371515675107500173200ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2026 Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #include "de265.h" #include #include extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { de265_decoder_context* ctx = de265_new_decoder(); if (!ctx) { return 0; } de265_set_parameter_bool(ctx, DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH, 0); de265_set_parameter_bool(ctx, DE265_DECODER_PARAM_SUPPRESS_FAULTY_PICTURES, 0); de265_set_parameter_int(ctx, DE265_DECODER_PARAM_ACCELERATION_CODE, de265_acceleration_SCALAR); // Interpret input as a sequence of length-prefixed NAL units. // Each NAL is preceded by a 4-byte big-endian length. size_t pos = 0; while (pos + 4 <= size) { uint32_t nal_size = (uint32_t(data[pos]) << 24) | (uint32_t(data[pos + 1]) << 16) | (uint32_t(data[pos + 2]) << 8) | uint32_t(data[pos + 3]); pos += 4; if (nal_size > size - pos) { break; } de265_push_NAL(ctx, data + pos, nal_size, 0, nullptr); pos += nal_size; } de265_flush_data(ctx); int more = 1; while (more) { de265_error err = de265_decode(ctx, &more); if (!more || (err != DE265_OK && err != DE265_ERROR_WAITING_FOR_INPUT_DATA)) { break; } while (const de265_image* img = de265_get_next_picture(ctx)) { (void)img; } } de265_free_decoder(ctx); return 0; } libde265-1.0.18/fuzzing/stream_fuzzer.cc000066400000000000000000000031641515675107500200400ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2026 Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #include "de265.h" #include #include extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { de265_decoder_context* ctx = de265_new_decoder(); if (!ctx) { return 0; } de265_set_parameter_bool(ctx, DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH, 0); de265_set_parameter_bool(ctx, DE265_DECODER_PARAM_SUPPRESS_FAULTY_PICTURES, 0); de265_set_parameter_int(ctx, DE265_DECODER_PARAM_ACCELERATION_CODE, de265_acceleration_SCALAR); de265_push_data(ctx, data, size, 0, nullptr); de265_flush_data(ctx); int more = 1; while (more) { de265_error err = de265_decode(ctx, &more); if (!more || (err != DE265_OK && err != DE265_ERROR_WAITING_FOR_INPUT_DATA)) { break; } while (const de265_image* img = de265_get_next_picture(ctx)) { (void)img; } } de265_free_decoder(ctx); return 0; } libde265-1.0.18/libde265.pc.in000066400000000000000000000004511515675107500153760ustar00rootroot00000000000000prefix=@prefix@ exec_prefix=@exec_prefix@ libdir=@libdir@ includedir=@includedir@ Name: libde265 Description: H.265/HEVC video decoder. URL: https://github.com/strukturag/libde265 Version: @PROJECT_VERSION@ Requires: Libs: -L${libdir} -lde265 Libs.private: @LIBS_PRIVATE@ Cflags: -I${includedir} libde265-1.0.18/libde265.png000066400000000000000000000202111515675107500151470ustar00rootroot00000000000000PNG  IHDR}~JtEXtSoftwareAdobe ImageReadyqe< +IDATx}U} F3*>t*fiv/u?jFcQ֨T'ZM Qf< K ĈDy==IOy<ٳw)׷B)eQB)#XB(HBBB B B!$B!$B!D!D!P!P! ! !BA"BA"B(HB(HBBB B B!$B!$BHW0vRK*J}3Uju,m\gJ{J?S_j7FNkeDeIJݿT3UV!ϟ,XqS/ BsCr||e%Q2E=uY]uΌ 3:fDIۗdGJ#ɻW}ړ^o}1N0TT߿|'~ JW\y̯wQo?a w!e X-⾥J]1/@)F>fj5500^5@)Ju M2{6 5qBBʈ4t.?uu5(ѯ%XL[HAٽ/_BORj)~貛QW9 J_тovoI1mھzC1RWߪ˯9VMXH!A Ϭ[C nRy_[ OmRui 닲]l_R40_zv q-Dj {5fSEr_#$,#Qd_@sH8\GBlak׀H`] n[LONvg &8LBҀeGA*ב"e-JXhC߁svx.I70,żvR%p)eE5(!YAQ1duFsmgi)_TpmR+U_TT 8\GPi ޳k|iFE7KpBu R:^f~DW @Gd.69h E5uWt:Ny.:~q'E3 >ızo)/{–>ː奆6M- ŬyCuK0T Cã?av=qc9GգDHUYE!^jӲC̳ =T-ӖSpl uG?[7#X. R^ẏonW7pԴ>>ۿ_zٟ? >o=j3ձOk{6\P~EԼYƱshL]s14Ԛ<V\xAIj$H5/B/-\A:K h\tuaz?kW/T=G !#Q&M( P돬 ;3T(у*K4! 
iB>%q~%: FBJ00ˮ.F(HCX'dc}]!77Vvq*Q:~񧂎i `Ydz,dѮڃB"%nßnh]?"DR բP?ԫR/^WƽH1^5ުh @/*!ċ/ut<%sC?TQZoZHiUص8uJぶv[e&E*b([ܚ3Npa!{R[.g,털y)h!u + CE[*s+'O F|E0~6 8o٦~7ۖk{ۿ[AB*iVԺۗ+D5Wk0{12Va )Wa$BhEZGV"(k,dBw6UM/fcKk렝"ٰDH0?cz DBk. Bh8^\b˨h>~A'ݹ[{AvwˢlWO,fۏ}Z{u$B "$6@!u?Bs 5?p0AA| 6巕PrKs _M}p9 l9B*^fŒy: -$dర&pQH\dn5H`)El^)6 !F݁ѷX4;[ФTa ʰ`f!ERC`ZH1W*_8dG|8{ = YCl|a~)V:ʅK7G˚>t:4د4l_0hRe8DH SY덴uJ뢀@ξ*PKXF$9C!iygHb1|`^b #JK'HH/[U8D ۮ|ڻFvb#T%{*(aB\S,]XE!ˎm Z4U#xm|L0/o uqiᠨpDHYF HY!RڦPA>-MϾBu7 ^Yuj`/'IOҵelR7]bWfhX+ĕK%1\T$׫#>}hO'燍U~|aI'7U7 6XvD7S `^s˸ίݣ~6bIx{5ljm=>b¶CC\SflE1{И;qR_o,zّ_ #x"FX @'M:NtXo2|q8-EЭO3!Ci{7 ӡ|ʿ6]*ι.L]XUB4 R ~-ʦk슊e֍`hi^(#{*1E`om5[u9 d9 \5qT.,KJ R(-YH)oVXz-H3r1Ԯ1r9 l:gغ&1lh!iPϑ,ke|uk* õd!%qTz/zԤ K37W=mj3u#*p*pVv RML0(+kJLMX 5wv۴]1}Fib%ֳմ+/V=/TőU4RrAjp]bk\Fzۢ@0kI_gUI@@yy] Q^3a/ `b!uSǞTVؼfm߿ReAܸvʢ:X U#D{]0:*F.W !e告^v ˮ\DƛVI*h#-Y`%m^tGx>Vm̳(aCtyD!<Y(P:v酗1ʦRϲJB>a8]b >vsGyByLo!V,N oY[V`8EZZLLH4,L\ <` IJt|3ZB&p1kM"2__^u 9rK9\GZR䧫*Iq Ua="fs}͆b귊>H0i:w[K;e\c-'2 F ֲ8GuܜGUFL瘦Yn Q܇շm!B:3BX%EQhLy[tW ,:ק/^LA"'熊Q'! _ňD!mk]nauP!]bvCBݮq淮$Er !b`\``q$ l(bUem):5B;M)) !d RJ" !FǯJR!`~08,9(ZD$B.F"K/=z !$J5]>5m%SCrDiď1kIH0ZB(Hki#XB(HBBB B B!$B!$B!D!D!P!P! ! !BA"BA"B(HB(HBH(s9w__ڷoߠczM[Ua덡!{c?z%x\[ E:GLyBe=G]6j]YҴdYþ#:1煖>.1v.9Ĕ|YUlY겑wi̳͊?:XۈPUSA n ᕍ\P Ӑ̬. vn24,7$7Ia $ksCV+k۰ U&(۹ȓP5e㼐s %|f1ov#76 4;r,&fզ:Hw&f]A~opldi̪Gz|FEvv*+H!/|hml/%\="}1vlA &_<ٳ(W*Q i<|>ABmeb>{׹5]p=yMx|ɹ@b^#bͷe[c[]sXFw}g+__:jUd]ft,,ϩ|l@^=FKM_Cszc2egk>>E5t>(=B:˵KW+<>h+KKWYb5|ͫ-^ݱy>Z"]&!LkB Y${ oZGתo̻(gG5e[G /ˡv_kڬYC,۵c-<_ٙu5Zf ʒY4V;YW2i>׶z%˪dr,u4u.!Bia'z)h gH,또ێK:/Wμ6*{rݘ\d\c;罷o}7,;beTB8iBB,[H|2Ң -KՆ4MiE L"2sdi i9Jzkq*BJ.HfCQSxAdGϼ6EH'xbP^0\ʉ yk"J*!B&_F"^ʶwЬG}^t\|=lWJW6gd/ ҧ#\ ʅ gXE6o%mnlu-yU<"˗{lh.f΋zQ +]whdPt#ښuOsuubcIAA^z᪗RÿaeiPR\_{ͺ ӊ^ 8],\P]miCȸ؎CC9$9&cr\7+5j28\q5f[+-\m9]1|s4*`ts-u>\׶qke[2ܙkGm.&Oݴy\B,K6.-[;<>Urz̲֫672vyNpnš! i&zm͕ةӡANC#{X6Hv5(F3JOL\w@br=)(Σ}ItWyEiuxrEmb=L-=8qކ)I؆0b` oW0=EC \,;y\ue{=u{|Ȳ9_ٛvVά=| 3]rYm@mu9Jzga.3YHRR~FCvܪ=٣=6ߐ&jmYq V mI;&TqfplWd\[|\ ܆&aUG$n Wbnnp(Fl஠Ķ]5Dƕ1~iUatH܆[yRz\ςD!" 
BA"B(HB(HBBB B B!$B!$B!D!D!P!P! ! !BA"BA"B(HB(HBBB B B! (IENDB`libde265-1.0.18/libde265/000077500000000000000000000000001515675107500144455ustar00rootroot00000000000000libde265-1.0.18/libde265/CMakeLists.txt000066400000000000000000000111251515675107500172050ustar00rootroot00000000000000include(CMakePackageConfigHelpers) set (libde265_sources alloc_pool.cc bitstream.cc cabac.cc contextmodel.cc de265.cc deblock.cc decctx.cc dpb.cc fallback-dct.cc fallback-motion.cc fallback.cc image-io.cc image.cc intrapred.cc md5.cc motion.cc nal-parser.cc nal.cc pps.cc quality.cc refpic.cc sao.cc scan.cc sei.cc slice.cc sps.cc threads.cc transform.cc util.cc visualize.cc vps.cc vui.cc ) set (libde265_headers acceleration.h alloc_pool.h bitstream.h cabac.h de265-version.h contextmodel.h de265.h deblock.h decctx.h dpb.h fallback-dct.h fallback-motion.h fallback.h image-io.h image.h intrapred.h md5.h motion.h nal-parser.h nal.h pps.h quality.h refpic.h sao.h scan.h sei.h slice.h sps.h threads.h transform.h util.h visualize.h vps.h vui.h ) set (libde265_public_headers de265.h ${CMAKE_CURRENT_BINARY_DIR}/de265-version.h ) if(MSVC OR MINGW) set (libde265_sources ${libde265_sources} ../extra/win32cond.c ../extra/win32cond.h ) endif() add_definitions(-DLIBDE265_EXPORTS) if (ENABLE_ENCODER) add_subdirectory (encoder) list(APPEND libde265_sources en265.cc) list(APPEND libde265_headers en265.h) list(APPEND libde265_public_headers en265.h) endif() if(HAVE_X86) if(SUPPORTS_SSE4_1 OR (SUPPORTS_SSE2 AND SUPPORTS_SSSE3)) add_subdirectory (x86) endif() endif() if(HAVE_ARM) add_subdirectory (arm) endif() add_library(de265 ${libde265_sources} ${libde265_public_headers} ${ENCODER_OBJECTS} ${X86_OBJECTS} ${ARM_OBJECTS}) target_include_directories(de265 PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) write_basic_package_version_file(libde265-config-version.cmake COMPATIBILITY ExactVersion) # --- debug output set(LOG_LEVELS error info debug trace) set(DE265_LOG_LEVEL "error" CACHE STRING "Log 
level (${LOG_LEVELS})") set_property(CACHE DE265_LOG_LEVEL PROPERTY STRINGS ${LOG_LEVELS}) if (NOT DE265_LOG_LEVEL IN_LIST LOG_LEVELS) message(FATAL_ERROR "DE265_LOG_LEVEL has to be one of: ${LOG_LEVELS}") endif () if (DE265_LOG_LEVEL MATCHES "error") target_compile_definitions(de265 PRIVATE DE265_LOG_ERROR) elseif (DE265_LOG_LEVEL MATCHES "info") target_compile_definitions(de265 PRIVATE DE265_LOG_ERROR DE265_LOG_INFO) elseif (DE265_LOG_LEVEL MATCHES "debug") target_compile_definitions(de265 PRIVATE DE265_LOG_ERROR DE265_LOG_INFO DE265_LOG_DEBUG) elseif (DE265_LOG_LEVEL MATCHES "trace") target_compile_definitions(de265 PRIVATE DE265_LOG_ERROR DE265_LOG_INFO DE265_LOG_DEBUG DE265_LOG_TRACE) endif() set_target_properties(de265 PROPERTIES VERSION "${DE265_LIBRARY_VERSION}" SOVERSION "${DE265_SOVERSION}" ) if (WIN32) set_target_properties(de265 PROPERTIES PREFIX "lib") endif() if (BUILD_FRAMEWORK) set_target_properties(de265 PROPERTIES FRAMEWORK TRUE FRAMEWORK_VERSION "${PROJECT_VERSION}" PRODUCT_BUNDLE_IDENTIFIER "github.com/strukturag/libde265" XCODE_ATTRIBUTE_INSTALL_PATH "@rpath" # OUTPUT_NAME "de265" XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY "" XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED "NO" XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED "NO" PUBLIC_HEADER "${libde265_public_headers}" MACOSX_FRAMEWORK_IDENTIFIER "github.com/strukturag/libde265" MACOSX_FRAMEWORK_BUNDLE_VERSION "${PROJECT_VERSION}" MACOSX_RPATH TRUE) endif() install(TARGETS de265 EXPORT libde265-config RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} FRAMEWORK DESTINATION Library/Frameworks COMPONENT runtime OPTIONAL INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/libde265 ) install(FILES ${libde265_public_headers} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/libde265) install(EXPORT libde265-config DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/libde265") install(FILES 
${CMAKE_CURRENT_BINARY_DIR}/libde265-config-version.cmake DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/libde265") # --- pkg-config set(prefix ${CMAKE_INSTALL_PREFIX}) set(exec_prefix "\${prefix}") if(IS_ABSOLUTE "${CMAKE_INSTALL_LIBDIR}") set(libdir "${CMAKE_INSTALL_LIBDIR}") else() set(libdir "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}") endif() if(IS_ABSOLUTE "${CMAKE_INSTALL_INCLUDEDIR}") set(includedir "${CMAKE_INSTALL_INCLUDEDIR}") else() set(includedir "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}") endif() configure_file(../libde265.pc.in ${CMAKE_CURRENT_BINARY_DIR}/libde265.pc @ONLY) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libde265.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) libde265-1.0.18/libde265/COPYING000066400000000000000000000167431515675107500155130ustar00rootroot00000000000000 GNU LESSER GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. This version of the GNU Lesser General Public License incorporates the terms and conditions of version 3 of the GNU General Public License, supplemented by the additional permissions listed below. 0. Additional Definitions. As used herein, "this License" refers to version 3 of the GNU Lesser General Public License, and the "GNU GPL" refers to version 3 of the GNU General Public License. "The Library" refers to a covered work governed by this License, other than an Application or a Combined Work as defined below. An "Application" is any work that makes use of an interface provided by the Library, but which is not otherwise based on the Library. Defining a subclass of a class defined by the Library is deemed a mode of using an interface provided by the Library. A "Combined Work" is a work produced by combining or linking an Application with the Library. 
The particular version of the Library with which the Combined Work was made is also called the "Linked Version". The "Minimal Corresponding Source" for a Combined Work means the Corresponding Source for the Combined Work, excluding any source code for portions of the Combined Work that, considered in isolation, are based on the Application, and not on the Linked Version. The "Corresponding Application Code" for a Combined Work means the object code and/or source code for the Application, including any data and utility programs needed for reproducing the Combined Work from the Application, but excluding the System Libraries of the Combined Work. 1. Exception to Section 3 of the GNU GPL. You may convey a covered work under sections 3 and 4 of this License without being bound by section 3 of the GNU GPL. 2. Conveying Modified Versions. If you modify a copy of the Library, and, in your modifications, a facility refers to a function or data to be supplied by an Application that uses the facility (other than as an argument passed when the facility is invoked), then you may convey a copy of the modified version: a) under this License, provided that you make a good faith effort to ensure that, in the event an Application does not supply the function or data, the facility still operates, and performs whatever part of its purpose remains meaningful, or b) under the GNU GPL, with none of the additional permissions of this License applicable to that copy. 3. Object Code Incorporating Material from Library Header Files. The object code form of an Application may incorporate material from a header file that is part of the Library. 
You may convey such object code under terms of your choice, provided that, if the incorporated material is not limited to numerical parameters, data structure layouts and accessors, or small macros, inline functions and templates (ten or fewer lines in length), you do both of the following: a) Give prominent notice with each copy of the object code that the Library is used in it and that the Library and its use are covered by this License. b) Accompany the object code with a copy of the GNU GPL and this license document. 4. Combined Works. You may convey a Combined Work under terms of your choice that, taken together, effectively do not restrict modification of the portions of the Library contained in the Combined Work and reverse engineering for debugging such modifications, if you also do each of the following: a) Give prominent notice with each copy of the Combined Work that the Library is used in it and that the Library and its use are covered by this License. b) Accompany the Combined Work with a copy of the GNU GPL and this license document. c) For a Combined Work that displays copyright notices during execution, include the copyright notice for the Library among these notices, as well as a reference directing the user to the copies of the GNU GPL and this license document. d) Do one of the following: 0) Convey the Minimal Corresponding Source under the terms of this License, and the Corresponding Application Code in a form suitable for, and under terms that permit, the user to recombine or relink the Application with a modified version of the Linked Version to produce a modified Combined Work, in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source. 1) Use a suitable shared library mechanism for linking with the Library. 
A suitable mechanism is one that (a) uses at run time a copy of the Library already present on the user's computer system, and (b) will operate properly with a modified version of the Library that is interface-compatible with the Linked Version. e) Provide Installation Information, but only if you would otherwise be required to provide such information under section 6 of the GNU GPL, and only to the extent that such information is necessary to install and execute a modified version of the Combined Work produced by recombining or relinking the Application with a modified version of the Linked Version. (If you use option 4d0, the Installation Information must accompany the Minimal Corresponding Source and Corresponding Application Code. If you use option 4d1, you must provide the Installation Information in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source.) 5. Combined Libraries. You may place library facilities that are a work based on the Library side by side in a single library together with other library facilities that are not Applications and are not covered by this License, and convey such a combined library under terms of your choice, if you do both of the following: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities, conveyed under the terms of this License. b) Give prominent notice with the combined library that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 6. Revised Versions of the GNU Lesser General Public License. The Free Software Foundation may publish revised and/or new versions of the GNU Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. 
If the Library as you received it specifies that a certain numbered version of the GNU Lesser General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that published version or of any later version published by the Free Software Foundation. If the Library as you received it does not specify a version number of the GNU Lesser General Public License, you may choose any version of the GNU Lesser General Public License ever published by the Free Software Foundation. If the Library as you received it specifies that a proxy can decide whether future versions of the GNU Lesser General Public License shall apply, that proxy's public statement of acceptance of any version is permanent authorization for you to choose that version for the Library. libde265-1.0.18/libde265/acceleration.h000066400000000000000000000553211515675107500172550ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #ifndef DE265_ACCELERATION_H #define DE265_ACCELERATION_H #include #include #include struct acceleration_functions { void (*put_weighted_pred_avg_8)(uint8_t *_dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height); void (*put_unweighted_pred_8)(uint8_t *_dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height); void (*put_weighted_pred_8)(uint8_t *_dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height, int w,int o,int log2WD); void (*put_weighted_bipred_8)(uint8_t *_dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height, int w1,int o1, int w2,int o2, int log2WD); void (*put_weighted_pred_avg_16)(uint16_t *_dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height, int bit_depth); void (*put_unweighted_pred_16)(uint16_t *_dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height, int bit_depth); void (*put_weighted_pred_16)(uint16_t *_dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height, int w,int o,int log2WD, int bit_depth); void (*put_weighted_bipred_16)(uint16_t *_dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height, int w1,int o1, int w2,int o2, int log2WD, int bit_depth); void put_weighted_pred_avg(void *_dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height, int bit_depth) const; void put_unweighted_pred(void *_dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height, int bit_depth) const; void put_weighted_pred(void *_dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height, int w,int o,int log2WD, int bit_depth) const; void put_weighted_bipred(void *_dst, ptrdiff_t dststride, const 
int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height, int w1,int o1, int w2,int o2, int log2WD, int bit_depth) const; void (*put_hevc_epel_8)(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer); void (*put_hevc_epel_h_8)(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth); void (*put_hevc_epel_v_8)(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth); void (*put_hevc_epel_hv_8)(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth); void (*put_hevc_qpel_8[4][4])(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int16_t* mcbuffer); void (*put_hevc_epel_16)(int16_t *dst, ptrdiff_t dststride, const uint16_t *src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth); void (*put_hevc_epel_h_16)(int16_t *dst, ptrdiff_t dststride, const uint16_t *src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth); void (*put_hevc_epel_v_16)(int16_t *dst, ptrdiff_t dststride, const uint16_t *src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth); void (*put_hevc_epel_hv_16)(int16_t *dst, ptrdiff_t dststride, const uint16_t *src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth); void (*put_hevc_qpel_16[4][4])(int16_t *dst, ptrdiff_t dststride, const uint16_t *src, ptrdiff_t srcstride, int width, int height, int16_t* mcbuffer, int bit_depth); void put_hevc_epel(int16_t *dst, ptrdiff_t dststride, const void *src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* 
mcbuffer, int bit_depth) const; void put_hevc_epel_h(int16_t *dst, ptrdiff_t dststride, const void *src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth) const; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride, const void *src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth) const; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride, const void *src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth) const; void put_hevc_qpel(int16_t *dst, ptrdiff_t dststride, const void *src, ptrdiff_t srcstride, int width, int height, int16_t* mcbuffer, int dX,int dY, int bit_depth) const; // --- inverse transforms --- void (*transform_bypass)(int32_t *residual, const int16_t *coeffs, int nT); void (*transform_bypass_rdpcm_v)(int32_t *r, const int16_t *coeffs, int nT); void (*transform_bypass_rdpcm_h)(int32_t *r, const int16_t *coeffs, int nT); // 8 bit void (*transform_skip_8)(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride); // no transform void (*transform_skip_rdpcm_v_8)(uint8_t *_dst, const int16_t *coeffs, int nT, ptrdiff_t _stride); void (*transform_skip_rdpcm_h_8)(uint8_t *_dst, const int16_t *coeffs, int nT, ptrdiff_t _stride); void (*transform_4x4_dst_add_8)(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); // iDST void (*transform_add_8[4])(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); // iDCT // 9-16 bit void (*transform_skip_16)(uint16_t *_dst, const int16_t *coeffs, ptrdiff_t _stride, int bit_depth); // no transform void (*transform_4x4_dst_add_16)(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); // iDST void (*transform_add_16[4])(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); // iDCT void (*rotate_coefficients)(int16_t *coeff, int nT); void (*transform_idst_4x4)(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); void 
(*transform_idct_4x4)(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); void (*transform_idct_8x8)(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); void (*transform_idct_16x16)(int32_t *dst,const int16_t *coeffs,int bdShift, int max_coeff_bits); void (*transform_idct_32x32)(int32_t *dst,const int16_t *coeffs,int bdShift, int max_coeff_bits); void (*add_residual_8)(uint8_t *dst, ptrdiff_t stride, const int32_t* r, int nT, int bit_depth); void (*add_residual_16)(uint16_t *dst,ptrdiff_t stride,const int32_t* r, int nT, int bit_depth); template void add_residual(pixel_t *dst, ptrdiff_t stride, const int32_t* r, int nT, int bit_depth) const; void (*rdpcm_v)(int32_t* residual, const int16_t* coeffs, int nT,int tsShift,int bdShift); void (*rdpcm_h)(int32_t* residual, const int16_t* coeffs, int nT,int tsShift,int bdShift); void (*transform_skip_residual)(int32_t *residual, const int16_t *coeffs, int nT, int tsShift,int bdShift); template void transform_skip(pixel_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const; template void transform_skip_rdpcm_v(pixel_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const; template void transform_skip_rdpcm_h(pixel_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const; template void transform_4x4_dst_add(pixel_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const; template void transform_add(int sizeIdx, pixel_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const; // --- forward transforms --- void (*fwd_transform_4x4_dst_8)(int16_t *coeffs, const int16_t* src, ptrdiff_t stride); // fDST // indexed with (log2TbSize-2) void (*fwd_transform_8[4]) (int16_t *coeffs, const int16_t *src, ptrdiff_t stride); // fDCT // forward Hadamard transform (without scaling factor) // (4x4,8x8,16x16,32x32) indexed with (log2TbSize-2) void (*hadamard_transform_8[4]) (int16_t *coeffs, const int16_t *src, ptrdiff_t 
stride); }; /* template <> inline void acceleration_functions::put_weighted_pred_avg(uint8_t *_dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height, int bit_depth) { put_weighted_pred_avg_8(_dst,dststride,src1,src2,srcstride,width,height); } template <> inline void acceleration_functions::put_weighted_pred_avg(uint16_t *_dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height, int bit_depth) { put_weighted_pred_avg_16(_dst,dststride,src1,src2, srcstride,width,height,bit_depth); } template <> inline void acceleration_functions::put_unweighted_pred(uint8_t *_dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height, int bit_depth) { put_unweighted_pred_8(_dst,dststride,src,srcstride,width,height); } template <> inline void acceleration_functions::put_unweighted_pred(uint16_t *_dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height, int bit_depth) { put_unweighted_pred_16(_dst,dststride,src,srcstride,width,height,bit_depth); } template <> inline void acceleration_functions::put_weighted_pred(uint8_t *_dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height, int w,int o,int log2WD, int bit_depth) { put_weighted_pred_8(_dst,dststride,src,srcstride,width,height,w,o,log2WD); } template <> inline void acceleration_functions::put_weighted_pred(uint16_t *_dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height, int w,int o,int log2WD, int bit_depth) { put_weighted_pred_16(_dst,dststride,src,srcstride,width,height,w,o,log2WD,bit_depth); } template <> inline void acceleration_functions::put_weighted_bipred(uint8_t *_dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height, int w1,int o1, int w2,int o2, int log2WD, int bit_depth) { 
put_weighted_bipred_8(_dst,dststride,src1,src2,srcstride, width,height, w1,o1,w2,o2,log2WD); } template <> inline void acceleration_functions::put_weighted_bipred(uint16_t *_dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height, int w1,int o1, int w2,int o2, int log2WD, int bit_depth) { put_weighted_bipred_16(_dst,dststride,src1,src2,srcstride, width,height, w1,o1,w2,o2,log2WD,bit_depth); } */ inline void acceleration_functions::put_weighted_pred_avg(void* _dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height, int bit_depth) const { if (bit_depth <= 8) put_weighted_pred_avg_8((uint8_t*)_dst,dststride,src1,src2,srcstride,width,height); else put_weighted_pred_avg_16((uint16_t*)_dst,dststride,src1,src2,srcstride,width,height,bit_depth); } inline void acceleration_functions::put_unweighted_pred(void* _dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height, int bit_depth) const { if (bit_depth <= 8) put_unweighted_pred_8((uint8_t*)_dst,dststride,src,srcstride,width,height); else put_unweighted_pred_16((uint16_t*)_dst,dststride,src,srcstride,width,height,bit_depth); } inline void acceleration_functions::put_weighted_pred(void* _dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height, int w,int o,int log2WD, int bit_depth) const { if (bit_depth <= 8) put_weighted_pred_8((uint8_t*)_dst,dststride,src,srcstride,width,height,w,o,log2WD); else put_weighted_pred_16((uint16_t*)_dst,dststride,src,srcstride,width,height,w,o,log2WD,bit_depth); } inline void acceleration_functions::put_weighted_bipred(void* _dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height, int w1,int o1, int w2,int o2, int log2WD, int bit_depth) const { if (bit_depth <= 8) put_weighted_bipred_8((uint8_t*)_dst,dststride,src1,src2,srcstride, width,height, w1,o1,w2,o2,log2WD); 
else put_weighted_bipred_16((uint16_t*)_dst,dststride,src1,src2,srcstride, width,height, w1,o1,w2,o2,log2WD,bit_depth); } inline void acceleration_functions::put_hevc_epel(int16_t *dst, ptrdiff_t dststride, const void *src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth) const { if (bit_depth <= 8) put_hevc_epel_8(dst,dststride,(const uint8_t*)src,srcstride,width,height,mx,my,mcbuffer); else put_hevc_epel_16(dst,dststride,(const uint16_t*)src,srcstride,width,height,mx,my,mcbuffer, bit_depth); } inline void acceleration_functions::put_hevc_epel_h(int16_t *dst, ptrdiff_t dststride, const void *src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth) const { if (bit_depth <= 8) put_hevc_epel_h_8(dst,dststride,(const uint8_t*)src,srcstride,width,height,mx,my,mcbuffer,bit_depth); else put_hevc_epel_h_16(dst,dststride,(const uint16_t*)src,srcstride,width,height,mx,my,mcbuffer,bit_depth); } inline void acceleration_functions::put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride, const void *src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth) const { if (bit_depth <= 8) put_hevc_epel_v_8(dst,dststride,(const uint8_t*)src,srcstride,width,height,mx,my,mcbuffer,bit_depth); else put_hevc_epel_v_16(dst,dststride,(const uint16_t*)src,srcstride,width,height,mx,my,mcbuffer, bit_depth); } inline void acceleration_functions::put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride, const void *src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth) const { if (bit_depth <= 8) put_hevc_epel_hv_8(dst,dststride,(const uint8_t*)src,srcstride,width,height,mx,my,mcbuffer,bit_depth); else put_hevc_epel_hv_16(dst,dststride,(const uint16_t*)src,srcstride,width,height,mx,my,mcbuffer, bit_depth); } inline void acceleration_functions::put_hevc_qpel(int16_t *dst, ptrdiff_t dststride, const void *src, ptrdiff_t srcstride, int 
width, int height, int16_t* mcbuffer, int dX,int dY, int bit_depth) const { if (bit_depth <= 8) put_hevc_qpel_8[dX][dY](dst,dststride,(const uint8_t*)src,srcstride,width,height,mcbuffer); else put_hevc_qpel_16[dX][dY](dst,dststride,(const uint16_t*)src,srcstride,width,height,mcbuffer, bit_depth); } template <> inline void acceleration_functions::transform_skip(uint8_t *dst, const int16_t *coeffs,ptrdiff_t stride, int bit_depth) const { transform_skip_8(dst,coeffs,stride); } template <> inline void acceleration_functions::transform_skip(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const { transform_skip_16(dst,coeffs,stride, bit_depth); } template <> inline void acceleration_functions::transform_skip_rdpcm_v(uint8_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const { assert(bit_depth==8); transform_skip_rdpcm_v_8(dst,coeffs,nT,stride); } template <> inline void acceleration_functions::transform_skip_rdpcm_h(uint8_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const { assert(bit_depth==8); transform_skip_rdpcm_h_8(dst,coeffs,nT,stride); } template <> inline void acceleration_functions::transform_skip_rdpcm_v(uint16_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const { assert(false); /*transform_skip_rdpcm_v_8(dst,coeffs,nT,stride);*/ } template <> inline void acceleration_functions::transform_skip_rdpcm_h(uint16_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const { assert(false); /*transform_skip_rdpcm_h_8(dst,coeffs,nT,stride);*/ } template <> inline void acceleration_functions::transform_4x4_dst_add(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride,int bit_depth) const { transform_4x4_dst_add_8(dst,coeffs,stride); } template <> inline void acceleration_functions::transform_4x4_dst_add(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride,int bit_depth) const { transform_4x4_dst_add_16(dst,coeffs,stride,bit_depth); } template <> 
inline void acceleration_functions::transform_add(int sizeIdx, uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const { transform_add_8[sizeIdx](dst,coeffs,stride); } template <> inline void acceleration_functions::transform_add(int sizeIdx, uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const { transform_add_16[sizeIdx](dst,coeffs,stride,bit_depth); } template <> inline void acceleration_functions::add_residual(uint8_t *dst, ptrdiff_t stride, const int32_t* r, int nT, int bit_depth) const { add_residual_8(dst,stride,r,nT,bit_depth); } template <> inline void acceleration_functions::add_residual(uint16_t *dst, ptrdiff_t stride, const int32_t* r, int nT, int bit_depth) const { add_residual_16(dst,stride,r,nT,bit_depth); } #endif libde265-1.0.18/libde265/alloc_pool.cc000066400000000000000000000042121515675107500170760ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2014 struktur AG, Dirk Farin * * Authors: Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #include "libde265/alloc_pool.h" #include "libde265/util.h" #include #include #define DEBUG_MEMORY 1 alloc_pool::alloc_pool(size_t objSize, int poolSize, bool grow) : mObjSize(objSize), mPoolSize(poolSize), mGrow(grow) { m_freeList.reserve(poolSize); m_memBlocks.reserve(8); add_memory_block(); } void alloc_pool::add_memory_block() { uint8_t* p = new uint8_t[mObjSize * mPoolSize]; m_memBlocks.push_back(p); for (int i=0;i * * Authors: Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #ifndef ALLOC_POOL_H #define ALLOC_POOL_H #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include class alloc_pool { public: alloc_pool(size_t objSize, int poolSize=1000, bool grow=true); ~alloc_pool(); void* new_obj(const size_t size); void delete_obj(void*); void purge(); private: size_t mObjSize; int mPoolSize; bool mGrow; std::vector m_memBlocks; std::vector m_freeList; void add_memory_block(); }; #endif libde265-1.0.18/libde265/arm/000077500000000000000000000000001515675107500152245ustar00rootroot00000000000000libde265-1.0.18/libde265/arm/CMakeLists.txt000066400000000000000000000006471515675107500177730ustar00rootroot00000000000000add_library(arm OBJECT arm.cc arm.h) if(HAVE_NEON) add_library(arm_neon OBJECT cpudetect.S hevcdsp_qpel_neon.S ) target_compile_options(arm_neon PRIVATE -mfpu=neon -DHAVE_NEON -DEXTERN_ASM= -DHAVE_AS_FUNC -DHAVE_SECTION_DATA_REL_RO ) set(ARM_OBJECTS $ $ PARENT_SCOPE) else() set(ARM_OBJECTS $ PARENT_SCOPE) endif() libde265-1.0.18/libde265/arm/arm.cc000066400000000000000000000076561515675107500163300ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2015 struktur AG, Joachim Bauch * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "arm.h" #ifdef HAVE_NEON #define QPEL_FUNC(name) \ extern "C" void ff_##name(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, \ int height, int width); \ void libde265_##name(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, \ int width, int height, int16_t* mcbuffer) { \ ff_##name(dst, dststride, src, srcstride, height, width); \ } QPEL_FUNC(hevc_put_qpel_v1_neon_8); QPEL_FUNC(hevc_put_qpel_v2_neon_8); QPEL_FUNC(hevc_put_qpel_v3_neon_8); QPEL_FUNC(hevc_put_qpel_h1_neon_8); QPEL_FUNC(hevc_put_qpel_h2_neon_8); QPEL_FUNC(hevc_put_qpel_h3_neon_8); QPEL_FUNC(hevc_put_qpel_h1v1_neon_8); QPEL_FUNC(hevc_put_qpel_h1v2_neon_8); QPEL_FUNC(hevc_put_qpel_h1v3_neon_8); QPEL_FUNC(hevc_put_qpel_h2v1_neon_8); QPEL_FUNC(hevc_put_qpel_h2v2_neon_8); QPEL_FUNC(hevc_put_qpel_h2v3_neon_8); QPEL_FUNC(hevc_put_qpel_h3v1_neon_8); QPEL_FUNC(hevc_put_qpel_h3v2_neon_8); QPEL_FUNC(hevc_put_qpel_h3v3_neon_8); #undef QPEL_FUNC #if defined(HAVE_SIGNAL_H) && defined(HAVE_SETJMP_H) #include #include extern "C" void libde265_detect_neon(void); static jmp_buf jump_env; static void sighandler(int sig) { (void)sig; longjmp(jump_env, 1); } static bool has_NEON() { static bool checked_NEON = false; static bool have_NEON = false; if (!checked_NEON) { void (*oldsignal)(int); checked_NEON = true; oldsignal = signal(SIGILL, sighandler); if (setjmp(jump_env)) { signal(SIGILL, oldsignal); have_NEON = false; return false; } libde265_detect_neon(); signal(SIGILL, oldsignal); have_NEON = true; } return have_NEON; } #else // #if defined(HAVE_SIGNAL_H) && defined(HAVE_SETJMP_H) #warning "Don't know how to detect NEON support at runtime- will be disabled" static bool has_NEON() { return false; } #endif #endif // #ifdef HAVE_NEON void init_acceleration_functions_arm(struct acceleration_functions* accel) { #ifdef HAVE_NEON if (has_NEON()) { accel->put_hevc_qpel_8[0][1] = libde265_hevc_put_qpel_v1_neon_8; 
accel->put_hevc_qpel_8[0][2] = libde265_hevc_put_qpel_v2_neon_8; accel->put_hevc_qpel_8[0][3] = libde265_hevc_put_qpel_v3_neon_8; accel->put_hevc_qpel_8[1][0] = libde265_hevc_put_qpel_h1_neon_8; accel->put_hevc_qpel_8[1][1] = libde265_hevc_put_qpel_h1v1_neon_8; accel->put_hevc_qpel_8[1][2] = libde265_hevc_put_qpel_h1v2_neon_8; accel->put_hevc_qpel_8[1][3] = libde265_hevc_put_qpel_h1v3_neon_8; accel->put_hevc_qpel_8[2][0] = libde265_hevc_put_qpel_h2_neon_8; accel->put_hevc_qpel_8[2][1] = libde265_hevc_put_qpel_h2v1_neon_8; accel->put_hevc_qpel_8[2][2] = libde265_hevc_put_qpel_h2v2_neon_8; accel->put_hevc_qpel_8[2][3] = libde265_hevc_put_qpel_h2v3_neon_8; accel->put_hevc_qpel_8[3][0] = libde265_hevc_put_qpel_h3_neon_8; accel->put_hevc_qpel_8[3][1] = libde265_hevc_put_qpel_h3v1_neon_8; accel->put_hevc_qpel_8[3][2] = libde265_hevc_put_qpel_h3v2_neon_8; accel->put_hevc_qpel_8[3][3] = libde265_hevc_put_qpel_h3v3_neon_8; } #endif // #ifdef HAVE_NEON } libde265-1.0.18/libde265/arm/arm.h000066400000000000000000000017201515675107500161540ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2015 struktur AG, Joachim Bauch * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #ifndef LIBDE265_ARM_H #define LIBDE265_ARM_H #include "acceleration.h" void init_acceleration_functions_arm(struct acceleration_functions* accel); #endif // LIBDE265_ARM_H libde265-1.0.18/libde265/arm/asm.S000066400000000000000000000175431515675107500161420ustar00rootroot00000000000000/* * Copyright (c) 2008 Mans Rullgard * * This file is part of FFmpeg. * * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "config.h" #ifdef __ELF__ # define ELF #else # define ELF @ #endif #if CONFIG_THUMB # define A @ # define T #else # define A # define T @ #endif #if HAVE_AS_FUNC # define FUNC #else # define FUNC @ #endif #if HAVE_NEON .arch armv7-a #elif HAVE_ARMV6T2 .arch armv6t2 #elif HAVE_ARMV6 .arch armv6 #elif HAVE_ARMV5TE .arch armv5te #endif #if HAVE_NEON .fpu neon #elif HAVE_VFP .fpu vfp #endif .syntax unified T .thumb ELF .eabi_attribute 25, 1 @ Tag_ABI_align_preserved ELF .section .note.GNU-stack,"",%progbits @ Mark stack as non-executable .macro function name, export=0, align=2 .set .Lpic_idx, 0 .set .Lpic_gp, 0 .macro endfunc .if .Lpic_idx .align 2 .altmacro put_pic %(.Lpic_idx - 1) .noaltmacro .endif ELF .size \name, . 
- \name FUNC .endfunc .purgem endfunc .endm .text .align \align .if \export .global EXTERN_ASM\name ELF .type EXTERN_ASM\name, %function FUNC .func EXTERN_ASM\name EXTERN_ASM\name: .else ELF .type \name, %function FUNC .func \name \name: .endif .endm .macro const name, align=2, relocate=0 .macro endconst ELF .size \name, . - \name .purgem endconst .endm .if HAVE_SECTION_DATA_REL_RO && \relocate .section .data.rel.ro .else .section .rodata .endif .align \align \name: .endm #if !HAVE_ARMV6T2_EXTERNAL .macro movw rd, val mov \rd, \val & 255 orr \rd, \val & ~255 .endm #endif .macro mov32 rd, val #if HAVE_ARMV6T2_EXTERNAL movw \rd, #(\val) & 0xffff .if (\val) >> 16 movt \rd, #(\val) >> 16 .endif #else ldr \rd, =\val #endif .endm .macro put_pic num put_pic_\num .endm .macro do_def_pic num, val, label .macro put_pic_\num .if \num .altmacro put_pic %(\num - 1) .noaltmacro .endif \label: .word \val .purgem put_pic_\num .endm .endm .macro def_pic val, label .altmacro do_def_pic %.Lpic_idx, \val, \label .noaltmacro .set .Lpic_idx, .Lpic_idx + 1 .endm .macro ldpic rd, val, indir=0 ldr \rd, .Lpicoff\@ .Lpic\@: .if \indir A ldr \rd, [pc, \rd] T add \rd, pc T ldr \rd, [\rd] .else add \rd, pc .endif def_pic \val - (.Lpic\@ + (8 >> CONFIG_THUMB)), .Lpicoff\@ .endm .macro movrel rd, val #if CONFIG_PIC ldpic \rd, \val #elif HAVE_ARMV6T2_EXTERNAL && !defined(__APPLE__) movw \rd, #:lower16:\val movt \rd, #:upper16:\val #else ldr \rd, =\val #endif .endm .macro movrelx rd, val, gp #if CONFIG_PIC && defined(__ELF__) .ifnb \gp .if .Lpic_gp .unreq gp .endif gp .req \gp ldpic gp, _GLOBAL_OFFSET_TABLE_ .elseif !.Lpic_gp gp .req r12 ldpic gp, _GLOBAL_OFFSET_TABLE_ .endif .set .Lpic_gp, 1 ldr \rd, .Lpicoff\@ ldr \rd, [gp, \rd] def_pic \val(GOT), .Lpicoff\@ #elif CONFIG_PIC && defined(__APPLE__) ldpic \rd, .Lpic\@, indir=1 .non_lazy_symbol_pointer .Lpic\@: .indirect_symbol \val .word 0 .text #else movrel \rd, \val #endif .endm .macro add_sh rd, rn, rm, sh:vararg A add \rd, \rn, \rm, \sh T mov 
\rm, \rm, \sh T add \rd, \rn, \rm .endm .macro ldr_pre rt, rn, rm:vararg A ldr \rt, [\rn, \rm]! T add \rn, \rn, \rm T ldr \rt, [\rn] .endm .macro ldr_dpre rt, rn, rm:vararg A ldr \rt, [\rn, -\rm]! T sub \rn, \rn, \rm T ldr \rt, [\rn] .endm .macro ldr_nreg rt, rn, rm:vararg A ldr \rt, [\rn, -\rm] T sub \rt, \rn, \rm T ldr \rt, [\rt] .endm .macro ldr_post rt, rn, rm:vararg A ldr \rt, [\rn], \rm T ldr \rt, [\rn] T add \rn, \rn, \rm .endm .macro ldrc_pre cc, rt, rn, rm:vararg A ldr\cc \rt, [\rn, \rm]! T itt \cc T add\cc \rn, \rn, \rm T ldr\cc \rt, [\rn] .endm .macro ldrd_reg rt, rt2, rn, rm A ldrd \rt, \rt2, [\rn, \rm] T add \rt, \rn, \rm T ldrd \rt, \rt2, [\rt] .endm .macro ldrd_post rt, rt2, rn, rm A ldrd \rt, \rt2, [\rn], \rm T ldrd \rt, \rt2, [\rn] T add \rn, \rn, \rm .endm .macro ldrh_pre rt, rn, rm A ldrh \rt, [\rn, \rm]! T add \rn, \rn, \rm T ldrh \rt, [\rn] .endm .macro ldrh_dpre rt, rn, rm A ldrh \rt, [\rn, -\rm]! T sub \rn, \rn, \rm T ldrh \rt, [\rn] .endm .macro ldrh_post rt, rn, rm A ldrh \rt, [\rn], \rm T ldrh \rt, [\rn] T add \rn, \rn, \rm .endm .macro ldrb_post rt, rn, rm A ldrb \rt, [\rn], \rm T ldrb \rt, [\rn] T add \rn, \rn, \rm .endm .macro str_post rt, rn, rm:vararg A str \rt, [\rn], \rm T str \rt, [\rn] T add \rn, \rn, \rm .endm .macro strb_post rt, rn, rm:vararg A strb \rt, [\rn], \rm T strb \rt, [\rn] T add \rn, \rn, \rm .endm .macro strd_post rt, rt2, rn, rm A strd \rt, \rt2, [\rn], \rm T strd \rt, \rt2, [\rn] T add \rn, \rn, \rm .endm .macro strh_pre rt, rn, rm A strh \rt, [\rn, \rm]! T add \rn, \rn, \rm T strh \rt, [\rn] .endm .macro strh_dpre rt, rn, rm A strh \rt, [\rn, -\rm]! 
T sub \rn, \rn, \rm T strh \rt, [\rn] .endm .macro strh_post rt, rn, rm A strh \rt, [\rn], \rm T strh \rt, [\rn] T add \rn, \rn, \rm .endm .macro strh_dpost rt, rn, rm A strh \rt, [\rn], -\rm T strh \rt, [\rn] T sub \rn, \rn, \rm .endm #if HAVE_VFP_ARGS ELF .eabi_attribute 28, 1 # define VFP # define NOVFP @ #else # define VFP @ # define NOVFP #endif #define GLUE(a, b) a ## b #define JOIN(a, b) GLUE(a, b) #define X(s) JOIN(EXTERN_ASM, s) libde265-1.0.18/libde265/arm/cpudetect.S000066400000000000000000000020041515675107500173240ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2015 struktur AG, Joachim Bauch * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #include "asm.S" #include "neon.S" // we execute a simple NEON instruction and check if SIGILL is triggered to // detect if the CPU support NEON code function libde265_detect_neon, export=1 vand q0, q0, q0 bx lr endfunc libde265-1.0.18/libde265/arm/hevcdsp_qpel_neon.S000066400000000000000000000664221515675107500210560ustar00rootroot00000000000000/* * Copyright (c) 2014 - 2015 Seppo Tomperi * * This file is part of FFmpeg. * * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ /* * This is commit 63ca0fe8288dbd300c9bb814cb671e5d889f691c from * https://github.com/FFmpeg/FFmpeg/blob/master/libavcodec/arm/hevcdsp_qpel_neon.S */ #include "asm.S" #include "neon.S" #define MAX_PB_SIZE #64 .macro regshuffle_d8 vmov d16, d17 vmov d17, d18 vmov d18, d19 vmov d19, d20 vmov d20, d21 vmov d21, d22 vmov d22, d23 .endm .macro regshuffle_q8 vmov q0, q1 vmov q1, q2 vmov q2, q3 vmov q3, q4 vmov q4, q5 vmov q5, q6 vmov q6, q7 .endm .macro vextin8 pld [r2] vld1.8 {q11}, [r2], r3 vext.8 d16, d22, d23, #1 vext.8 d17, d22, d23, #2 vext.8 d18, d22, d23, #3 vext.8 d19, d22, d23, #4 vext.8 d20, d22, d23, #5 vext.8 d21, d22, d23, #6 vext.8 d22, d22, d23, #7 .endm .macro loadin8 pld [r2] vld1.8 {d16}, [r2], r3 pld [r2] vld1.8 {d17}, [r2], r3 pld [r2] vld1.8 {d18}, [r2], r3 pld [r2] vld1.8 {d19}, [r2], r3 pld [r2] vld1.8 {d20}, [r2], r3 pld [r2] vld1.8 {d21}, [r2], r3 pld [r2] vld1.8 {d22}, [r2], r3 pld [r2] vld1.8 {d23}, [r2], r3 .endm .macro qpel_filter_1_32b vmov.i16 d16, #58 vmov.i16 d17, #10 vmull.s16 q9, d6, d16 // 58 * d0 vmull.s16 q10, d7, d16 // 58 * d1 vmov.i16 d16, #17 vmull.s16 q11, d4, d17 // 10 * c0 vmull.s16 q12, d5, d17 // 10 * c1 vmov.i16 d17, #5 vmull.s16 q13, d8, d16 // 17 * e0 vmull.s16 q14, d9, d16 // 17 * e1 vmull.s16 q15, d10, d17 // 5 * f0 vmull.s16 q8, d11, d17 // 5 * f1 vsub.s32 q9, q11 // 58 * d0 - 10 * c0 vsub.s32 q10, q12 // 58 * d1 - 10 * c1 vshll.s16 q11, d2, #2 // 4 * b0 vshll.s16 q12, d3, #2 // 4 * b1 vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 vadd.s32 q10, q14 // 58 * d1 - 10 
* c1 + 17 * e1 vsubl.s16 q13, d12, d0 // g0 - a0 vsubl.s16 q14, d13, d1 // g1 - a1 vadd.s32 q9, q11 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 vadd.s32 q10, q12 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 vsub.s32 q13, q15 // g0 - a0 - 5 * f0 vsub.s32 q14, q8 // g1 - a1 - 5 * f1 vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0 vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1 vqshrn.s32 d16, q9, #6 vqshrn.s32 d17, q10, #6 .endm // input q0 - q7 // output q8 .macro qpel_filter_2_32b vmov.i32 q8, #11 vaddl.s16 q9, d6, d8 // d0 + e0 vaddl.s16 q10, d7, d9 // d1 + e1 vaddl.s16 q11, d4, d10 // c0 + f0 vaddl.s16 q12, d5, d11 // c1 + f1 vmul.s32 q11, q8 // 11 * (c0 + f0) vmul.s32 q12, q8 // 11 * (c1 + f1) vmov.i32 q8, #40 vaddl.s16 q15, d2, d12 // b0 + g0 vmul.s32 q9, q8 // 40 * (d0 + e0) vmul.s32 q10, q8 // 40 * (d1 + e1) vaddl.s16 q8, d3, d13 // b1 + g1 vaddl.s16 q13, d0, d14 // a0 + h0 vaddl.s16 q14, d1, d15 // a1 + h1 vshl.s32 q15, #2 // 4*(b0+g0) vshl.s32 q8, #2 // 4*(b1+g1) vadd.s32 q11, q13 // 11 * (c0 + f0) + a0 + h0 vadd.s32 q12, q14 // 11 * (c1 + f1) + a1 + h1 vadd.s32 q9, q15 // 40 * (d0 + e0) + 4*(b0+g0) vadd.s32 q10, q8 // 40 * (d1 + e1) + 4*(b1+g1) vsub.s32 q9, q11 // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0) vsub.s32 q10, q12 // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1) vqshrn.s32 d16, q9, #6 vqshrn.s32 d17, q10, #6 .endm .macro qpel_filter_3_32b vmov.i16 d16, #58 vmov.i16 d17, #10 vmull.s16 q9, d8, d16 // 58 * d0 vmull.s16 q10, d9, d16 // 58 * d1 vmov.i16 d16, #17 vmull.s16 q11, d10, d17 // 10 * c0 vmull.s16 q12, d11, d17 // 10 * c1 vmov.i16 d17, #5 vmull.s16 q13, d6, d16 // 17 * e0 vmull.s16 q14, d7, d16 // 17 * e1 vmull.s16 q15, d4, d17 // 5 * f0 vmull.s16 q8, d5, d17 // 5 * f1 vsub.s32 q9, q11 // 58 * d0 - 10 * c0 vsub.s32 q10, q12 // 58 * d1 - 10 * c1 vshll.s16 q11, d12, #2 // 4 * b0 vshll.s16 q12, d13, #2 // 4 * b1 vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 vadd.s32 q10, 
q14 // 58 * d1 - 10 * c1 + 17 * e1 vsubl.s16 q13, d2, d14 // g0 - a0 vsubl.s16 q14, d3, d15 // g1 - a1 vadd.s32 q9, q11 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 vadd.s32 q10, q12 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 vsub.s32 q13, q15 // g0 - a0 - 5 * f0 vsub.s32 q14, q8 // g1 - a1 - 5 * f1 vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0 vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1 vqshrn.s32 d16, q9, #6 vqshrn.s32 d17, q10, #6 .endm .macro qpel_filter_1 out=q7 vmov.u8 d24, #58 vmov.u8 d25, #10 vshll.u8 q13, d20, #4 // 16*e vshll.u8 q14, d21, #2 // 4*f vmull.u8 \out, d19, d24 // 58*d vaddw.u8 q13, q13, d20 // 17*e vmull.u8 q15, d18, d25 // 10*c vaddw.u8 q14, q14, d21 // 5*f vsubl.u8 q12, d22, d16 // g - a vadd.u16 \out, q13 // 58d + 17e vshll.u8 q13, d17, #2 // 4*b vadd.u16 q15, q14 // 10*c + 5*f vadd.s16 q13, q12 // - a + 4*b + g vsub.s16 \out, q15 // -10*c + 58*d + 17*e -5*f vadd.s16 \out, q13 // -a + 4*b -10*c + 58*d + 17*e -5*f .endm .macro qpel_filter_2 out=q7 vmov.i16 q12, #10 vmov.i16 q14, #11 vaddl.u8 q13, d19, d20 // d + e vaddl.u8 q15, d18, d21 // c + f vmul.u16 q13, q12 // 10 * (d+e) vmul.u16 q15, q14 // 11 * ( c + f) vaddl.u8 \out, d17, d22 // b + g vaddl.u8 q12, d16, d23 // a + h vadd.u16 \out, q13 // b + 10 * (d + e) + g vadd.s16 q12, q15 vshl.u16 \out, #2 // 4 * (b + 10 * (d + e) + g) vsub.s16 \out, q12 .endm .macro qpel_filter_3 out=q7 vmov.u8 d24, #58 vmov.u8 d25, #10 vshll.u8 q13, d19, #4 // 16*e vshll.u8 q14, d18, #2 // 4*f vmull.u8 \out, d20, d24 // 58*d vaddw.u8 q13, q13, d19 // 17*e vmull.u8 q15, d21, d25 // 10*c vaddw.u8 q14, q14, d18 // 5*f vsubl.u8 q12, d17, d23 // g - a vadd.u16 \out, q13 // 58d + 17e vshll.u8 q13, d22, #2 // 4*b vadd.u16 q15, q14 // 10*c + 5*f vadd.s16 q13, q12 // - a + 4*b + g vsub.s16 \out, q15 // -10*c + 58*d + 17*e -5*f vadd.s16 \out, q13 // -a + 4*b -10*c + 58*d + 17*e -5*f .endm .macro hevc_put_qpel_vX_neon_8 filter push {r4, r5, r6, r7} ldr r4, [sp, #16] 
// height ldr r5, [sp, #20] // width vpush {d8-d15} sub r2, r2, r3, lsl #1 sub r2, r3 mov r12, r4 mov r6, r0 mov r7, r2 lsl r1, #1 0: loadin8 cmp r5, #4 beq 4f 8: subs r4, #1 \filter vst1.16 {q7}, [r0], r1 regshuffle_d8 vld1.8 {d23}, [r2], r3 bne 8b subs r5, #8 beq 99f mov r4, r12 add r6, #16 mov r0, r6 add r7, #8 mov r2, r7 b 0b 4: subs r4, #1 \filter vst1.16 d14, [r0], r1 regshuffle_d8 vld1.32 {d23[0]}, [r2], r3 bne 4b 99: vpop {d8-d15} pop {r4, r5, r6, r7} bx lr .endm .macro hevc_put_qpel_uw_vX_neon_8 filter push {r4-r10} ldr r5, [sp, #28] // width ldr r4, [sp, #32] // height ldr r8, [sp, #36] // src2 ldr r9, [sp, #40] // src2stride vpush {d8-d15} sub r2, r2, r3, lsl #1 sub r2, r3 mov r12, r4 mov r6, r0 mov r7, r2 cmp r8, #0 bne .Lbi\@ 0: loadin8 cmp r5, #4 beq 4f 8: subs r4, #1 \filter vqrshrun.s16 d0, q7, #6 vst1.8 d0, [r0], r1 regshuffle_d8 vld1.8 {d23}, [r2], r3 bne 8b subs r5, #8 beq 99f mov r4, r12 add r6, #8 mov r0, r6 add r7, #8 mov r2, r7 b 0b 4: subs r4, #1 \filter vqrshrun.s16 d0, q7, #6 vst1.32 d0[0], [r0], r1 regshuffle_d8 vld1.32 {d23[0]}, [r2], r3 bne 4b b 99f .Lbi\@: lsl r9, #1 mov r10, r8 0: loadin8 cmp r5, #4 beq 4f 8: subs r4, #1 \filter vld1.16 {q0}, [r8], r9 vqadd.s16 q0, q7 vqrshrun.s16 d0, q0, #7 vst1.8 d0, [r0], r1 regshuffle_d8 vld1.8 {d23}, [r2], r3 bne 8b subs r5, #8 beq 99f mov r4, r12 add r6, #8 mov r0, r6 add r10, #16 mov r8, r10 add r7, #8 mov r2, r7 b 0b 4: subs r4, #1 \filter vld1.16 d0, [r8], r9 vqadd.s16 d0, d14 vqrshrun.s16 d0, q0, #7 vst1.32 d0[0], [r0], r1 regshuffle_d8 vld1.32 {d23[0]}, [r2], r3 bne 4b 99: vpop {d8-d15} pop {r4-r10} bx lr .endm function ff_hevc_put_qpel_v1_neon_8, export=1 hevc_put_qpel_vX_neon_8 qpel_filter_1 endfunc function ff_hevc_put_qpel_v2_neon_8, export=1 hevc_put_qpel_vX_neon_8 qpel_filter_2 endfunc function ff_hevc_put_qpel_v3_neon_8, export=1 hevc_put_qpel_vX_neon_8 qpel_filter_3 endfunc function ff_hevc_put_qpel_uw_v1_neon_8, export=1 hevc_put_qpel_uw_vX_neon_8 qpel_filter_1 endfunc function 
ff_hevc_put_qpel_uw_v2_neon_8, export=1 hevc_put_qpel_uw_vX_neon_8 qpel_filter_2 endfunc function ff_hevc_put_qpel_uw_v3_neon_8, export=1 hevc_put_qpel_uw_vX_neon_8 qpel_filter_3 endfunc .macro hevc_put_qpel_hX_neon_8 filter push {r4, r5, r6, r7} ldr r4, [sp, #16] // height ldr r5, [sp, #20] // width vpush {d8-d15} sub r2, #4 lsl r1, #1 mov r12, r4 mov r6, r0 mov r7, r2 cmp r5, #4 beq 4f 8: subs r4, #1 vextin8 \filter vst1.16 {q7}, [r0], r1 bne 8b subs r5, #8 beq 99f mov r4, r12 add r6, #16 mov r0, r6 add r7, #8 mov r2, r7 cmp r5, #4 bne 8b 4: subs r4, #1 vextin8 \filter vst1.16 d14, [r0], r1 bne 4b 99: vpop {d8-d15} pop {r4, r5, r6, r7} bx lr .endm .macro hevc_put_qpel_uw_hX_neon_8 filter push {r4-r10} ldr r5, [sp, #28] // width ldr r4, [sp, #32] // height ldr r8, [sp, #36] // src2 ldr r9, [sp, #40] // src2stride vpush {d8-d15} sub r2, #4 mov r12, r4 mov r6, r0 mov r7, r2 cmp r8, #0 bne .Lbi\@ cmp r5, #4 beq 4f 8: subs r4, #1 vextin8 \filter vqrshrun.s16 d0, q7, #6 vst1.8 d0, [r0], r1 bne 8b subs r5, #8 beq 99f mov r4, r12 add r6, #8 mov r0, r6 add r7, #8 mov r2, r7 cmp r5, #4 bne 8b 4: subs r4, #1 vextin8 \filter vqrshrun.s16 d0, q7, #6 vst1.32 d0[0], [r0], r1 bne 4b b 99f .Lbi\@: lsl r9, #1 cmp r5, #4 beq 4f mov r10, r8 8: subs r4, #1 vextin8 \filter vld1.16 {q0}, [r8], r9 vqadd.s16 q0, q7 vqrshrun.s16 d0, q0, #7 vst1.8 d0, [r0], r1 bne 8b subs r5, #8 beq 99f mov r4, r12 add r6, #8 add r10, #16 mov r8, r10 mov r0, r6 add r7, #8 mov r2, r7 cmp r5, #4 bne 8b 4: subs r4, #1 vextin8 \filter vld1.16 d0, [r8], r9 vqadd.s16 d0, d14 vqrshrun.s16 d0, q0, #7 vst1.32 d0[0], [r0], r1 bne 4b 99: vpop {d8-d15} pop {r4-r10} bx lr .endm function ff_hevc_put_qpel_h1_neon_8, export=1 hevc_put_qpel_hX_neon_8 qpel_filter_1 endfunc function ff_hevc_put_qpel_h2_neon_8, export=1 hevc_put_qpel_hX_neon_8 qpel_filter_2 endfunc function ff_hevc_put_qpel_h3_neon_8, export=1 hevc_put_qpel_hX_neon_8 qpel_filter_3 endfunc function ff_hevc_put_qpel_uw_h1_neon_8, export=1 
hevc_put_qpel_uw_hX_neon_8 qpel_filter_1 endfunc function ff_hevc_put_qpel_uw_h2_neon_8, export=1 hevc_put_qpel_uw_hX_neon_8 qpel_filter_2 endfunc function ff_hevc_put_qpel_uw_h3_neon_8, export=1 hevc_put_qpel_uw_hX_neon_8 qpel_filter_3 endfunc .macro hevc_put_qpel_hXvY_neon_8 filterh filterv push {r4, r5, r6, r7} ldr r4, [sp, #16] // height ldr r5, [sp, #20] // width vpush {d8-d15} sub r2, #4 sub r2, r2, r3, lsl #1 sub r2, r3 // extra_before 3 lsl r1, #1 mov r12, r4 mov r6, r0 mov r7, r2 0: vextin8 \filterh q0 vextin8 \filterh q1 vextin8 \filterh q2 vextin8 \filterh q3 vextin8 \filterh q4 vextin8 \filterh q5 vextin8 \filterh q6 vextin8 \filterh q7 cmp r5, #4 beq 4f 8: subs r4, #1 \filterv vst1.16 {q8}, [r0], r1 regshuffle_q8 vextin8 \filterh q7 bne 8b subs r5, #8 beq 99f mov r4, r12 add r6, #16 mov r0, r6 add r7, #8 mov r2, r7 b 0b 4: subs r4, #1 \filterv vst1.16 d16, [r0], r1 regshuffle_q8 vextin8 \filterh q7 bne 4b 99: vpop {d8-d15} pop {r4, r5, r6, r7} bx lr .endm .macro hevc_put_qpel_uw_hXvY_neon_8 filterh filterv push {r4-r10} ldr r5, [sp, #28] // width ldr r4, [sp, #32] // height ldr r8, [sp, #36] // src2 ldr r9, [sp, #40] // src2stride vpush {d8-d15} sub r2, #4 sub r2, r2, r3, lsl #1 sub r2, r3 // extra_before 3 mov r12, r4 mov r6, r0 mov r7, r2 cmp r8, #0 bne .Lbi\@ 0: vextin8 \filterh q0 vextin8 \filterh q1 vextin8 \filterh q2 vextin8 \filterh q3 vextin8 \filterh q4 vextin8 \filterh q5 vextin8 \filterh q6 vextin8 \filterh q7 cmp r5, #4 beq 4f 8: subs r4, #1 \filterv vqrshrun.s16 d0, q8, #6 vst1.8 d0, [r0], r1 regshuffle_q8 vextin8 \filterh q7 bne 8b subs r5, #8 beq 99f mov r4, r12 add r6, #8 mov r0, r6 add r7, #8 mov r2, r7 b 0b 4: subs r4, #1 \filterv vqrshrun.s16 d0, q8, #6 vst1.32 d0[0], [r0], r1 regshuffle_q8 vextin8 \filterh q7 bne 4b b 99f .Lbi\@: lsl r9, #1 mov r10, r8 0: vextin8 \filterh q0 vextin8 \filterh q1 vextin8 \filterh q2 vextin8 \filterh q3 vextin8 \filterh q4 vextin8 \filterh q5 vextin8 \filterh q6 vextin8 \filterh q7 cmp r5, #4 beq 4f 
8: subs r4, #1 \filterv vld1.16 {q0}, [r8], r9 vqadd.s16 q0, q8 vqrshrun.s16 d0, q0, #7 vst1.8 d0, [r0], r1 regshuffle_q8 vextin8 \filterh q7 bne 8b subs r5, #8 beq 99f mov r4, r12 add r6, #8 mov r0, r6 add r10, #16 mov r8, r10 add r7, #8 mov r2, r7 b 0b 4: subs r4, #1 \filterv vld1.16 d0, [r8], r9 vqadd.s16 d0, d16 vqrshrun.s16 d0, q0, #7 vst1.32 d0[0], [r0], r1 regshuffle_q8 vextin8 \filterh q7 bne 4b 99: vpop {d8-d15} pop {r4-r10} bx lr .endm function ff_hevc_put_qpel_h1v1_neon_8, export=1 hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b endfunc function ff_hevc_put_qpel_h2v1_neon_8, export=1 hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b endfunc function ff_hevc_put_qpel_h3v1_neon_8, export=1 hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b endfunc function ff_hevc_put_qpel_h1v2_neon_8, export=1 hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b endfunc function ff_hevc_put_qpel_h2v2_neon_8, export=1 hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b endfunc function ff_hevc_put_qpel_h3v2_neon_8, export=1 hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b endfunc function ff_hevc_put_qpel_h1v3_neon_8, export=1 hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b endfunc function ff_hevc_put_qpel_h2v3_neon_8, export=1 hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b endfunc function ff_hevc_put_qpel_h3v3_neon_8, export=1 hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b endfunc function ff_hevc_put_qpel_uw_h1v1_neon_8, export=1 hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b endfunc function ff_hevc_put_qpel_uw_h2v1_neon_8, export=1 hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b endfunc function ff_hevc_put_qpel_uw_h3v1_neon_8, export=1 hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b endfunc function ff_hevc_put_qpel_uw_h1v2_neon_8, export=1 hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b endfunc function ff_hevc_put_qpel_uw_h2v2_neon_8, 
export=1 hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b endfunc function ff_hevc_put_qpel_uw_h3v2_neon_8, export=1 hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b endfunc function ff_hevc_put_qpel_uw_h1v3_neon_8, export=1 hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b endfunc function ff_hevc_put_qpel_uw_h2v3_neon_8, export=1 hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b endfunc function ff_hevc_put_qpel_uw_h3v3_neon_8, export=1 hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b endfunc .macro init_put_pixels pld [r1] pld [r1, r2] mov r12, MAX_PB_SIZE lsl r12, #1 .endm function ff_hevc_put_pixels_w2_neon_8, export=1 init_put_pixels vmov.u8 d5, #255 vshr.u64 d5, #32 0: subs r3, #1 vld1.32 {d0[0]}, [r1], r2 pld [r1] vld1.32 d6, [r0] vshll.u8 q0, d0, #6 vbit d6, d0, d5 vst1.32 d6, [r0], r12 bne 0b bx lr endfunc function ff_hevc_put_pixels_w4_neon_8, export=1 init_put_pixels 0: subs r3, #2 vld1.32 {d0[0]}, [r1], r2 vld1.32 {d0[1]}, [r1], r2 pld [r1] pld [r1, r2] vshll.u8 q0, d0, #6 vst1.64 {d0}, [r0], r12 vst1.64 {d1}, [r0], r12 bne 0b bx lr endfunc function ff_hevc_put_pixels_w6_neon_8, export=1 init_put_pixels vmov.u8 q10, #255 vshr.u64 d21, #32 0: subs r3, #1 vld1.16 {d0}, [r1], r2 pld [r1] vshll.u8 q0, d0, #6 vld1.8 {q12}, [r0] vbit q12, q0, q10 vst1.8 {q12}, [r0], r12 bne 0b bx lr endfunc function ff_hevc_put_pixels_w8_neon_8, export=1 init_put_pixels 0: subs r3, #2 vld1.8 {d0}, [r1], r2 vld1.8 {d2}, [r1], r2 pld [r1] pld [r1, r2] vshll.u8 q0, d0, #6 vshll.u8 q1, d2, #6 vst1.16 {q0}, [r0], r12 vst1.16 {q1}, [r0], r12 bne 0b bx lr endfunc function ff_hevc_put_pixels_w12_neon_8, export=1 init_put_pixels 0: subs r3, #2 vld1.64 {d0}, [r1] add r1, #8 vld1.32 {d1[0]}, [r1], r2 sub r1, #8 vld1.64 {d2}, [r1] add r1, #8 vld1.32 {d1[1]}, [r1], r2 sub r1, #8 pld [r1] pld [r1, r2] vshll.u8 q8, d0, #6 vshll.u8 q9, d1, #6 vshll.u8 q10, d2, #6 vmov d22, d19 vst1.64 {d16, d17, d18}, [r0], r12 vst1.64 {d20, d21, 
d22}, [r0], r12 bne 0b bx lr endfunc function ff_hevc_put_pixels_w16_neon_8, export=1 init_put_pixels 0: subs r3, #2 vld1.8 {q0}, [r1], r2 vld1.8 {q1}, [r1], r2 pld [r1] pld [r1, r2] vshll.u8 q8, d0, #6 vshll.u8 q9, d1, #6 vshll.u8 q10, d2, #6 vshll.u8 q11, d3, #6 vst1.8 {q8, q9}, [r0], r12 vst1.8 {q10, q11}, [r0], r12 bne 0b bx lr endfunc function ff_hevc_put_pixels_w24_neon_8, export=1 init_put_pixels 0: subs r3, #1 vld1.8 {d0, d1, d2}, [r1], r2 pld [r1] vshll.u8 q10, d0, #6 vshll.u8 q11, d1, #6 vshll.u8 q12, d2, #6 vstm r0, {q10, q11, q12} add r0, r12 bne 0b bx lr endfunc function ff_hevc_put_pixels_w32_neon_8, export=1 init_put_pixels 0: subs r3, #1 vld1.8 {q0, q1}, [r1], r2 pld [r1] vshll.u8 q8, d0, #6 vshll.u8 q9, d1, #6 vshll.u8 q10, d2, #6 vshll.u8 q11, d3, #6 vstm r0, {q8, q9, q10, q11} add r0, r12 bne 0b bx lr endfunc function ff_hevc_put_pixels_w48_neon_8, export=1 init_put_pixels 0: subs r3, #1 vld1.8 {q0, q1}, [r1] add r1, #32 vld1.8 {q2}, [r1], r2 sub r1, #32 pld [r1] vshll.u8 q8, d0, #6 vshll.u8 q9, d1, #6 vshll.u8 q10, d2, #6 vshll.u8 q11, d3, #6 vshll.u8 q12, d4, #6 vshll.u8 q13, d5, #6 vstm r0, {q8, q9, q10, q11, q12, q13} add r0, r12 bne 0b bx lr endfunc function ff_hevc_put_pixels_w64_neon_8, export=1 init_put_pixels 0: subs r3, #1 vld1.8 {q0, q1}, [r1] add r1, #32 vld1.8 {q2, q3}, [r1], r2 sub r1, #32 pld [r1] vshll.u8 q8, d0, #6 vshll.u8 q9, d1, #6 vshll.u8 q10, d2, #6 vshll.u8 q11, d3, #6 vshll.u8 q12, d4, #6 vshll.u8 q13, d5, #6 vshll.u8 q14, d6, #6 vshll.u8 q15, d7, #6 vstm r0, {q8, q9, q10, q11, q12, q13, q14, q15} add r0, r12 bne 0b bx lr endfunc function ff_hevc_put_qpel_uw_pixels_neon_8, export=1 push {r4-r9} ldr r5, [sp, #24] // width ldr r4, [sp, #28] // height ldr r8, [sp, #32] // src2 ldr r9, [sp, #36] // src2stride vpush {d8-d15} cmp r8, #0 bne 2f 1: subs r4, #1 vld1.8 {d0}, [r2], r3 vst1.8 d0, [r0], r1 bne 1b vpop {d8-d15} pop {r4-r9} bx lr 2: subs r4, #1 vld1.8 {d0}, [r2], r3 vld1.16 {q1}, [r8], r9 vshll.u8 q0, d0, #6 vqadd.s16 
q0, q1 vqrshrun.s16 d0, q0, #7 vst1.8 d0, [r0], r1 bne 2b vpop {d8-d15} pop {r4-r9} bx lr endfunc .macro put_qpel_uw_pixels width, regs, regs2, regs3, regs4 function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1 ldr r12, [sp] // height 1: subs r12, #4 vld1.32 {\regs} , [r2], r3 vld1.32 {\regs2} , [r2], r3 vld1.32 {\regs3} , [r2], r3 vld1.32 {\regs4} , [r2], r3 vst1.32 {\regs} , [r0], r1 vst1.32 {\regs2} , [r0], r1 vst1.32 {\regs3} , [r0], r1 vst1.32 {\regs4} , [r0], r1 bne 1b bx lr endfunc .endm .macro put_qpel_uw_pixels_m width, regs, regs2, regs3, regs4 function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1 push {r4-r5} ldr r12, [sp, #8] // height 1: subs r12, #2 mov r4, r2 vld1.32 {\regs} , [r2]! vld1.32 {\regs2} , [r2] add r2, r4, r3 mov r4, r2 vld1.32 {\regs3} , [r2]! vld1.32 {\regs4} , [r2] add r2, r4, r3 mov r5, r0 vst1.32 {\regs} , [r0]! vst1.32 {\regs2} , [r0] add r0, r5, r1 mov r5, r0 vst1.32 {\regs3} , [r0]! vst1.32 {\regs4} , [r0] add r0, r5, r1 bne 1b pop {r4-r5} bx lr endfunc .endm put_qpel_uw_pixels 4, d0[0], d0[1], d1[0], d1[1] put_qpel_uw_pixels 8, d0, d1, d2, d3 put_qpel_uw_pixels_m 12, d0, d1[0], d2, d3[0] put_qpel_uw_pixels 16, q0, q1, q2, q3 put_qpel_uw_pixels 24, d0-d2, d3-d5, d16-d18, d19-d21 put_qpel_uw_pixels 32, q0-q1, q2-q3, q8-q9, q10-q11 put_qpel_uw_pixels_m 48, q0-q1, q2, q8-q9, q10 put_qpel_uw_pixels_m 64, q0-q1, q2-q3, q8-q9, q10-q11 libde265-1.0.18/libde265/arm/neon.S000066400000000000000000000036671515675107500163230ustar00rootroot00000000000000/* * Copyright (c) 2008 Mans Rullgard * * This file is part of FFmpeg. * * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ .macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7 vtrn.32 \r0, \r4 vtrn.32 \r1, \r5 vtrn.32 \r2, \r6 vtrn.32 \r3, \r7 vtrn.16 \r0, \r2 vtrn.16 \r1, \r3 vtrn.16 \r4, \r6 vtrn.16 \r5, \r7 vtrn.8 \r0, \r1 vtrn.8 \r2, \r3 vtrn.8 \r4, \r5 vtrn.8 \r6, \r7 .endm .macro transpose_4x4 r0, r1, r2, r3 vtrn.16 \r0, \r2 vtrn.16 \r1, \r3 vtrn.8 \r0, \r1 vtrn.8 \r2, \r3 .endm .macro swap4 r0, r1, r2, r3, r4, r5, r6, r7 vswp \r0, \r4 vswp \r1, \r5 vswp \r2, \r6 vswp \r3, \r7 .endm .macro transpose16_4x4 r0, r1, r2, r3, r4, r5, r6, r7 vtrn.32 \r0, \r2 vtrn.32 \r1, \r3 vtrn.32 \r4, \r6 vtrn.32 \r5, \r7 vtrn.16 \r0, \r1 vtrn.16 \r2, \r3 vtrn.16 \r4, \r5 vtrn.16 \r6, \r7 .endm libde265-1.0.18/libde265/bitstream.cc000066400000000000000000000063141515675107500167520ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #include "bitstream.h" #include "de265.h" #include #include #include bitreader::bitreader(unsigned char* buffer, int len) { data = buffer; bytes_remaining = len; } void bitreader::refill() { int shift = 64-nextbits_cnt; while (shift >= 8 && bytes_remaining) { uint64_t newval = *data++; bytes_remaining--; shift -= 8; newval <<= shift; nextbits |= newval; } nextbits_cnt = 64-shift; } uint32_t bitreader::get_bits(int n) { if (n == 0) return 0; assert(n<=32); if (nextbits_cnt < n) { refill(); } uint64_t val = nextbits; val >>= 64-n; nextbits <<= n; nextbits_cnt -= n; return val; } uint32_t bitreader::get_bits_fast(int n) { if (n == 0) return 0; assert(n<=32); assert(nextbits_cnt >= n); uint64_t val = nextbits; val >>= 64-n; nextbits <<= n; nextbits_cnt -= n; return val; } uint32_t bitreader::peek_bits(int n) { if (n == 0) return 0; assert(n<=32); if (nextbits_cnt < n) { refill(); } uint64_t val = nextbits; val >>= 64-n; return val; } void bitreader::skip_bits(int n) { if (nextbits_cnt < n) { refill(); } nextbits <<= n; nextbits_cnt -= n; } void bitreader::skip_bits_fast(int n) { nextbits <<= n; nextbits_cnt -= n; } void bitreader::skip_to_byte_boundary() { int nskip = (nextbits_cnt & 7); nextbits <<= nskip; nextbits_cnt -= nskip; } void bitreader::prepare_for_CABAC() { skip_to_byte_boundary(); int rewind = nextbits_cnt/8; data -= rewind; bytes_remaining += rewind; nextbits = 0; nextbits_cnt = 0; } uint32_t bitreader::get_uvlc() { int num_zeros=0; while (get_bits(1)==0) { num_zeros++; if (num_zeros > MAX_UVLC_LEADING_ZEROS) { return UVLC_ERROR; } } if (num_zeros != 0) { uint32_t offset = get_bits(num_zeros); uint32_t value = offset + (static_cast(1)<0); return value; } else { return 0; } } int32_t bitreader::get_svlc() { uint32_t v = get_uvlc(); if (v==0) return 0; if (v==UVLC_ERROR) return SVLC_ERROR; bool negative = ((v&1)==0); return negative ? 
-static_cast(v/2) : static_cast((v+1)/2); } bool bitreader::check_rbsp_trailing_bits() { int stop_bit = get_bits(1); assert(stop_bit==1); (void)stop_bit; while (nextbits_cnt>0 || bytes_remaining>0) { int filler = get_bits(1); if (filler!=0) { return false; } } return true; } libde265-1.0.18/libde265/bitstream.h000066400000000000000000000036711515675107500166170ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef DE265_BITSTREAM_H #define DE265_BITSTREAM_H #include // HEVC (ITU-T H.265, E.3.3) allows ue(v) values up to 2^32-2 (e.g. bit_rate_value_minus1), // which requires 31 leading zeros in the exp-Golomb code. 32 leading zeros would give a // minimum codeNum of 2^32-1, which exceeds every syntax element's valid range. 
constexpr int MAX_UVLC_LEADING_ZEROS = 31; constexpr uint32_t UVLC_ERROR = UINT32_MAX; constexpr int32_t SVLC_ERROR = INT32_MIN; class bitreader { public: bitreader() = default; bitreader(unsigned char* buffer, int len); uint32_t get_bits(int n); // n in [0;32] uint32_t get_bits_fast(int n); // n in [0;32] uint32_t peek_bits(int n); void skip_bits(int n); void skip_bits_fast(int n); void skip_to_byte_boundary(); void prepare_for_CABAC(); uint32_t get_uvlc(); // may return UVLC_ERROR int32_t get_svlc(); // may return SVLC_ERROR bool check_rbsp_trailing_bits(); // return true if remaining filler bits are all zero uint8_t* data = nullptr; int bytes_remaining = 0; private: void refill(); // refill to at least 56+1 bits uint64_t nextbits = 0; // left-aligned bits int nextbits_cnt = 0; }; #endif libde265-1.0.18/libde265/cabac.cc000066400000000000000000000635571515675107500160250ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #include "cabac.h" #include "util.h" #include #include #include #include #define INITIAL_CABAC_BUFFER_CAPACITY 4096 static const uint8_t LPS_table[64][4] = { { 128, 176, 208, 240}, { 128, 167, 197, 227}, { 128, 158, 187, 216}, { 123, 150, 178, 205}, { 116, 142, 169, 195}, { 111, 135, 160, 185}, { 105, 128, 152, 175}, { 100, 122, 144, 166}, { 95, 116, 137, 158}, { 90, 110, 130, 150}, { 85, 104, 123, 142}, { 81, 99, 117, 135}, { 77, 94, 111, 128}, { 73, 89, 105, 122}, { 69, 85, 100, 116}, { 66, 80, 95, 110}, { 62, 76, 90, 104}, { 59, 72, 86, 99}, { 56, 69, 81, 94}, { 53, 65, 77, 89}, { 51, 62, 73, 85}, { 48, 59, 69, 80}, { 46, 56, 66, 76}, { 43, 53, 63, 72}, { 41, 50, 59, 69}, { 39, 48, 56, 65}, { 37, 45, 54, 62}, { 35, 43, 51, 59}, { 33, 41, 48, 56}, { 32, 39, 46, 53}, { 30, 37, 43, 50}, { 29, 35, 41, 48}, { 27, 33, 39, 45}, { 26, 31, 37, 43}, { 24, 30, 35, 41}, { 23, 28, 33, 39}, { 22, 27, 32, 37}, { 21, 26, 30, 35}, { 20, 24, 29, 33}, { 19, 23, 27, 31}, { 18, 22, 26, 30}, { 17, 21, 25, 28}, { 16, 20, 23, 27}, { 15, 19, 22, 25}, { 14, 18, 21, 24}, { 14, 17, 20, 23}, { 13, 16, 19, 22}, { 12, 15, 18, 21}, { 12, 14, 17, 20}, { 11, 14, 16, 19}, { 11, 13, 15, 18}, { 10, 12, 15, 17}, { 10, 12, 14, 16}, { 9, 11, 13, 15}, { 9, 11, 12, 14}, { 8, 10, 12, 14}, { 8, 9, 11, 13}, { 7, 9, 11, 12}, { 7, 9, 10, 12}, { 7, 8, 10, 11}, { 6, 8, 9, 11}, { 6, 7, 9, 10}, { 6, 7, 8, 9}, { 2, 2, 2, 2} }; static const uint8_t renorm_table[32] = { 6, 5, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; static const uint8_t next_state_MPS[64] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16, 17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32, 33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48, 49,50,51,52,53,54,55,56,57,58,59,60,61,62,62,63 }; static const uint8_t next_state_LPS[64] = { 0,0,1,2,2,4,4,5,6,7,8,9,9,11,11,12, 13,13,15,15,16,16,18,18,19,19,21,21,22,22,23,24, 24,25,26,26,27,27,28,29,29,30,30,30,31,32,32,33, 
33,33,34,34,35,35,35,36,36,36,37,37,37,38,38,63 }; #ifdef DE265_LOG_TRACE int logcnt=1; #endif void CABAC_decoder::init(uint8_t* bitstream, int length) { assert(length >= 0); bitstream_start = bitstream; bitstream_curr = bitstream; bitstream_end = bitstream+length; } void CABAC_decoder::init_CABAC() { int length = bitstream_end - bitstream_curr; range = 510; bits_needed = 8; value = 0; if (length>0) { value = (*bitstream_curr++) << 8; bits_needed-=8; } if (length>1) { value |= (*bitstream_curr++); bits_needed-=8; } logtrace(LogCABAC,"[%3d] init_CABAC_decode_2 r:%x v:%x\n", logcnt, range, value); } int CABAC_decoder::decode_bit(context_model* model) { logtrace(LogCABAC,"[%3d] decodeBin r:%x v:%x state:%d\n",logcnt,range, value, model->state); int decoded_bit; int LPS = LPS_table[model->state][ ( range >> 6 ) - 4 ]; range -= LPS; uint32_t scaled_range = range << 7; logtrace(LogCABAC,"[%3d] sr:%x v:%x\n",logcnt,scaled_range, value); if (value < scaled_range) { logtrace(LogCABAC,"[%3d] MPS\n",logcnt); // MPS path decoded_bit = model->MPSbit; model->state = next_state_MPS[model->state]; if (scaled_range < ( 256 << 7 ) ) { // scaled range, highest bit (15) not set range = scaled_range >> 6; // shift range by one bit value <<= 1; // shift value by one bit bits_needed++; if (bits_needed == 0) { bits_needed = -8; if (bitstream_curr < bitstream_end) { value |= *bitstream_curr++; } } } } else { logtrace(LogCABAC,"[%3d] LPS\n",logcnt); //printf("%d %d\n", model->state, 0); // LPS path value = (value - scaled_range); uint8_t num_bits = renorm_table[ LPS >> 3 ]; value <<= num_bits; range = LPS << num_bits; /* this is always >= 0x100 except for state 63, but state 63 is never used */ #ifndef NDEBUG int num_bitsTab = renorm_table[ LPS >> 3 ]; assert(num_bits == num_bitsTab); #endif decoded_bit = 1 - model->MPSbit; if (model->state==0) { model->MPSbit = 1-model->MPSbit; } model->state = next_state_LPS[model->state]; bits_needed += num_bits; if (bits_needed >= 0) { 
logtrace(LogCABAC,"bits_needed: %d\n", bits_needed); if (bitstream_curr < bitstream_end) { value |= (*bitstream_curr++) << bits_needed; } bits_needed -= 8; } } logtrace(LogCABAC,"[%3d] -> bit %d r:%x v:%x\n", logcnt, decoded_bit, range, value); #ifdef DE265_LOG_TRACE logcnt++; #endif return decoded_bit; } int CABAC_decoder::decode_term_bit() { logtrace(LogCABAC,"CABAC term: range=%x\n", range); range -= 2; uint32_t scaledRange = range << 7; if (value >= scaledRange) { return 1; } else { // there is a while loop in the standard, but it will always be executed only once if (scaledRange < (256<<7)) { range = scaledRange >> 6; value *= 2; bits_needed++; if (bits_needed==0) { bits_needed = -8; if (bitstream_curr < bitstream_end) { value += (*bitstream_curr++); } } } return 0; } } // When we read past the end of the bitstream (which should only happen on faulty bitstreams), // we will eventually only return zeros. int CABAC_decoder::decode_bypass() { logtrace(LogCABAC,"[%3d] bypass r:%x v:%x\n",logcnt,range, value); value <<= 1; bits_needed++; if (bits_needed >= 0) { if (bitstream_end > bitstream_curr) { bits_needed = -8; value |= *bitstream_curr++; } else { // we read past the end of the bitstream, fill with 0 bits_needed = -8; } } int bit; uint32_t scaled_range = range << 7; if (value >= scaled_range) { value -= scaled_range; bit=1; } else { bit=0; } logtrace(LogCABAC,"[%3d] -> bit %d r:%x v:%x\n", logcnt, bit, range, value); #ifdef DE265_LOG_TRACE logcnt++; #endif return bit; } int CABAC_decoder::decode_TU_bypass(int cMax) { for (int i=0;i= 0) { if (bitstream_end > bitstream_curr) { int input = *bitstream_curr++; input <<= bits_needed; bits_needed -= 8; value |= input; } } uint32_t scaled_range = range << 7; int v = value / scaled_range; if (unlikely(v>=(1< value %d r:%x v:%x\n", logcnt+nBits-1, v, range, value); #ifdef DE265_LOG_TRACE logcnt+=nBits; #endif return v; } uint32_t CABAC_decoder::decode_FL_bypass(int nBits) { uint32_t v=0; if (likely(nBits<=8)) { if 
(nBits==0) { return 0; } // we could use decode_bypass() for a single bit, but this seems to be slower #if 0 else if (nBits==1) { v = decode_bypass(); } #endif else { v = decode_FL_bypass_parallel(nBits); } } else { v = decode_FL_bypass_parallel(8); nBits-=8; while (nBits--) { v <<= 1; v |= decode_bypass(); } } logtrace(LogCABAC," -> FL: %d\n", v); return v; } int CABAC_decoder::decode_TR_bypass(int cRiceParam, int cTRMax) { int prefix = decode_TU_bypass(cTRMax>>cRiceParam); if (prefix==4) { // TODO check: constant 4 only works for coefficient decoding return cTRMax; } int suffix = decode_FL_bypass(cRiceParam); return (prefix << cRiceParam) | suffix; } uint32_t CABAC_decoder::decode_EGk_bypass(int k) { uint32_t base=0; int n=k; for (;;) { int bit = decode_bypass(); if (bit==0) break; else { if (n >= 31) { return 0; // TODO: error } base += 1u<=8) { append_byte((vlc_buffer >> (vlc_buffer_len-8)) & 0xFF); vlc_buffer_len -= 8; } } void CABAC_encoder::write_uvlc(int value) { assert(value>=0); int nLeadingZeros=0; int base=0; int range=1; while (value>=base+range) { base += range; range <<= 1; nLeadingZeros++; } write_bits((1<0) write_uvlc(2*value-1); else write_uvlc(-2*value); } void CABAC_encoder_bitstream::flush_VLC() { // TODO: errors returned by append_byte() are ignored, resulting in a broken output. 
while (vlc_buffer_len>=8) { append_byte((vlc_buffer >> (vlc_buffer_len-8)) & 0xFF); vlc_buffer_len -= 8; } if (vlc_buffer_len>0) { append_byte(vlc_buffer << (8-vlc_buffer_len)); vlc_buffer_len = 0; } vlc_buffer = 0; } void CABAC_encoder_bitstream::skip_bits(int nBits) { while (nBits>=8) { write_bits(0,8); nBits-=8; } if (nBits>0) { write_bits(0,nBits); } } int CABAC_encoder_bitstream::number_free_bits_in_byte() const { if ((vlc_buffer_len % 8)==0) return 0; return 8- (vlc_buffer_len % 8); } bool CABAC_encoder_bitstream::check_size_and_resize(int nBytes) { if (data_size+nBytes > data_capacity) { // 1 extra byte for stuffing if (data_capacity==0) { data_capacity = INITIAL_CABAC_BUFFER_CAPACITY; } else { data_capacity *= 2; } uint8_t* new_data_mem = (uint8_t*)realloc(data_mem,data_capacity); if (new_data_mem) { data_mem = new_data_mem; } else { return false; } } return true; } bool CABAC_encoder_bitstream::append_byte(int byte) { if (!check_size_and_resize(2)) { return false; } // --- emulation prevention --- /* These byte sequences may never occur in the bitstream: 0x000000 / 0x000001 / 0x000002 Hence, we have to add a 0x03 before the third byte. We also have to add a 0x03 for this sequence: 0x000003, because the escape byte itself also has to be escaped. 
*/ // S0 --(0)--> S1 --(0)--> S2 --(0,1,2,3)--> add stuffing if (byte<=3) { /**/ if (state< 2 && byte==0) { state++; } else if (state==2 && byte<=3) { data_mem[ data_size++ ] = 3; if (byte==0) state=1; else state=0; } else { state=0; } } else { state=0; } // write actual data byte data_mem[ data_size++ ] = byte; return true; } bool CABAC_encoder_bitstream::write_startcode() { if (!check_size_and_resize(3)) { return false; } data_mem[ data_size+0 ] = 0; data_mem[ data_size+1 ] = 0; data_mem[ data_size+2 ] = 1; data_size+=3; return true; } void CABAC_encoder_bitstream::init_CABAC() { range = 510; low = 0; bits_left = 23; buffered_byte = 0xFF; num_buffered_bytes = 0; } void CABAC_encoder_bitstream::flush_CABAC() { // TODO: errors returned by append_byte() are ignored, resulting in a broken output. if (low >> (32 - bits_left)) { append_byte(buffered_byte + 1); while (num_buffered_bytes > 1) { append_byte(0x00); num_buffered_bytes--; } low -= 1 << (32 - bits_left); } else { if (num_buffered_bytes > 0) { append_byte(buffered_byte); } while (num_buffered_bytes > 1) { append_byte(0xff); num_buffered_bytes--; } } // printf("low: %08x nbits left:%d filled:%d\n",low,bits_left,32-bits_left); write_bits(low >> 8, 24-bits_left); } void CABAC_encoder_bitstream::write_out() { // TODO: errors returned by append_byte() are ignored, resulting in a broken output. 
//logtrace(LogCABAC,"low = %08x (bits_left=%d)\n",low,bits_left); int leadByte = low >> (24 - bits_left); bits_left += 8; low &= 0xffffffffu >> bits_left; //logtrace(LogCABAC,"write byte %02x\n",leadByte); //logtrace(LogCABAC,"-> low = %08x\n",low); if (leadByte == 0xff) { num_buffered_bytes++; } else { if (num_buffered_bytes > 0) { int carry = leadByte >> 8; int byte = buffered_byte + carry; buffered_byte = leadByte & 0xff; append_byte(byte); byte = ( 0xff + carry ) & 0xff; while ( num_buffered_bytes > 1 ) { append_byte(byte); num_buffered_bytes--; } } else { num_buffered_bytes = 1; buffered_byte = leadByte; } } } void CABAC_encoder_bitstream::testAndWriteOut() { // logtrace(LogCABAC,"bits_left = %d\n",bits_left); if (bits_left < 12) { write_out(); } } #ifdef DE265_LOG_TRACE int encBinCnt=1; #endif void CABAC_encoder_bitstream::write_CABAC_bit(int modelIdx, int bin) { context_model* model = &(*mCtxModels)[modelIdx]; //m_uiBinsCoded += m_binCountIncrement; //rcCtxModel.setBinsCoded( 1 ); logtrace(LogCABAC,"[%d] range=%x low=%x state=%d, bin=%d\n", encBinCnt, range,low, model->state,bin); /* printf("[%d] range=%x low=%x state=%d, bin=%d\n", encBinCnt, range,low, model->state,bin); printf("%d %d X\n",model->state,bin != model->MPSbit); */ #ifdef DE265_LOG_TRACE encBinCnt++; #endif uint32_t LPS = LPS_table[model->state][ ( range >> 6 ) - 4 ]; range -= LPS; if (bin != model->MPSbit) { //logtrace(LogCABAC,"LPS\n"); int num_bits = renorm_table[ LPS >> 3 ]; low = (low + range) << num_bits; range = LPS << num_bits; if (model->state==0) { model->MPSbit = 1-model->MPSbit; } model->state = next_state_LPS[model->state]; bits_left -= num_bits; } else { //logtrace(LogCABAC,"MPS\n"); model->state = next_state_MPS[model->state]; // renorm if (range >= 256) { return; } low <<= 1; range <<= 1; bits_left--; } testAndWriteOut(); } void CABAC_encoder_bitstream::write_CABAC_bypass(int bin) { logtrace(LogCABAC,"[%d] bypass = %d, range=%x\n",encBinCnt,bin,range); /* printf("[%d] bypass = 
%d, range=%x\n",encBinCnt,bin,range); printf("%d %d X\n",64, -1); */ #ifdef DE265_LOG_TRACE encBinCnt++; #endif // BinsCoded += m_binCountIncrement; low <<= 1; if (bin) { low += range; } bits_left--; testAndWriteOut(); } void CABAC_encoder::write_CABAC_TU_bypass(int value, int cMax) { for (int i=0;i0) { n--; write_CABAC_bypass(value & (1<= 256) { return; } else { low <<= 1; range <<= 1; bits_left--; } testAndWriteOut(); } static const uint32_t entropy_table[128] = { // -------------------- 200 -------------------- /* state= 0 */ 0x07d13 /* 0.977164 */, 0x08255 /* 1.018237 */, /* state= 1 */ 0x07738 /* 0.931417 */, 0x086ef /* 1.054179 */, /* state= 2 */ 0x0702b /* 0.876323 */, 0x0935a /* 1.151195 */, /* state= 3 */ 0x069e6 /* 0.827333 */, 0x09c7f /* 1.222650 */, /* state= 4 */ 0x062e8 /* 0.772716 */, 0x0a2c7 /* 1.271708 */, /* state= 5 */ 0x05c18 /* 0.719488 */, 0x0ae25 /* 1.360532 */, /* state= 6 */ 0x05632 /* 0.673414 */, 0x0b724 /* 1.430793 */, /* state= 7 */ 0x05144 /* 0.634904 */, 0x0c05d /* 1.502850 */, /* state= 8 */ 0x04bdf /* 0.592754 */, 0x0ccf2 /* 1.601145 */, /* state= 9 */ 0x0478d /* 0.559012 */, 0x0d57b /* 1.667843 */, /* state=10 */ 0x042ad /* 0.520924 */, 0x0de81 /* 1.738336 */, /* state=11 */ 0x03f4d /* 0.494564 */, 0x0e4b8 /* 1.786871 */, /* state=12 */ 0x03a9d /* 0.457945 */, 0x0f471 /* 1.909721 */, /* state=13 */ 0x037d5 /* 0.436201 */, 0x0fc56 /* 1.971385 */, /* state=14 */ 0x034c2 /* 0.412177 */, 0x10236 /* 2.017284 */, /* state=15 */ 0x031a6 /* 0.387895 */, 0x10d5c /* 2.104394 */, /* state=16 */ 0x02e62 /* 0.362383 */, 0x11b34 /* 2.212552 */, /* state=17 */ 0x02c20 /* 0.344752 */, 0x120b4 /* 2.255512 */, /* state=18 */ 0x029b8 /* 0.325943 */, 0x1294d /* 2.322672 */, /* state=19 */ 0x02791 /* 0.309143 */, 0x135e1 /* 2.420959 */, /* state=20 */ 0x02562 /* 0.292057 */, 0x13e37 /* 2.486077 */, /* state=21 */ 0x0230d /* 0.273846 */, 0x144fd /* 2.539000 */, /* state=22 */ 0x02193 /* 0.262308 */, 0x150c9 /* 2.631150 */, /* state=23 */ 0x01f5d /* 
0.245026 */, 0x15ca0 /* 2.723641 */, /* state=24 */ 0x01de7 /* 0.233617 */, 0x162f9 /* 2.773246 */, /* state=25 */ 0x01c2f /* 0.220208 */, 0x16d99 /* 2.856259 */, /* state=26 */ 0x01a8e /* 0.207459 */, 0x17a93 /* 2.957634 */, /* state=27 */ 0x0195a /* 0.198065 */, 0x18051 /* 3.002477 */, /* state=28 */ 0x01809 /* 0.187778 */, 0x18764 /* 3.057759 */, /* state=29 */ 0x0164a /* 0.174144 */, 0x19460 /* 3.159206 */, /* state=30 */ 0x01539 /* 0.165824 */, 0x19f20 /* 3.243181 */, /* state=31 */ 0x01452 /* 0.158756 */, 0x1a465 /* 3.284334 */, /* state=32 */ 0x0133b /* 0.150261 */, 0x1b422 /* 3.407303 */, /* state=33 */ 0x0120c /* 0.140995 */, 0x1bce5 /* 3.475767 */, /* state=34 */ 0x01110 /* 0.133315 */, 0x1c394 /* 3.527962 */, /* state=35 */ 0x0104d /* 0.127371 */, 0x1d059 /* 3.627736 */, /* state=36 */ 0x00f8b /* 0.121451 */, 0x1d74b /* 3.681983 */, /* state=37 */ 0x00ef4 /* 0.116829 */, 0x1dfd0 /* 3.748540 */, /* state=38 */ 0x00e10 /* 0.109864 */, 0x1e6d3 /* 3.803335 */, /* state=39 */ 0x00d3f /* 0.103507 */, 0x1f925 /* 3.946462 */, /* state=40 */ 0x00cc4 /* 0.099758 */, 0x1fda7 /* 3.981667 */, /* state=41 */ 0x00c42 /* 0.095792 */, 0x203f8 /* 4.031012 */, /* state=42 */ 0x00b78 /* 0.089610 */, 0x20f7d /* 4.121014 */, /* state=43 */ 0x00afc /* 0.085830 */, 0x21dd6 /* 4.233102 */, /* state=44 */ 0x00a5e /* 0.081009 */, 0x22419 /* 4.282016 */, /* state=45 */ 0x00a1b /* 0.078950 */, 0x22a5e /* 4.331015 */, /* state=46 */ 0x00989 /* 0.074514 */, 0x23756 /* 4.432323 */, /* state=47 */ 0x0091b /* 0.071166 */, 0x24225 /* 4.516775 */, /* state=48 */ 0x008cf /* 0.068837 */, 0x2471a /* 4.555487 */, /* state=49 */ 0x00859 /* 0.065234 */, 0x25313 /* 4.649048 */, /* state=50 */ 0x00814 /* 0.063140 */, 0x25d67 /* 4.729721 */, /* state=51 */ 0x007b6 /* 0.060272 */, 0x2651f /* 4.790028 */, /* state=52 */ 0x0076e /* 0.058057 */, 0x2687c /* 4.816294 */, /* state=53 */ 0x00707 /* 0.054924 */, 0x27da7 /* 4.981661 */, /* state=54 */ 0x006d5 /* 0.053378 */, 0x28172 /* 5.011294 */, /* 
state=55 */ 0x00659 /* 0.049617 */, 0x28948 /* 5.072512 */, /* state=56 */ 0x00617 /* 0.047598 */, 0x297c5 /* 5.185722 */, /* state=57 */ 0x005dd /* 0.045814 */, 0x2a2df /* 5.272434 */, /* state=58 */ 0x005c1 /* 0.044965 */, 0x2a581 /* 5.293019 */, /* state=59 */ 0x00574 /* 0.042619 */, 0x2ad59 /* 5.354304 */, /* state=60 */ 0x0053b /* 0.040882 */, 0x2bba5 /* 5.465973 */, /* state=61 */ 0x0050c /* 0.039448 */, 0x2c596 /* 5.543651 */, /* state=62 */ 0x004e9 /* 0.038377 */, 0x2cd88 /* 5.605741 */, 0x00400 , 0x2d000 /* dummy, should never be used */ }; #if 0 static const uint32_t entropy_table_orig[128] = { 0x07b23, 0x085f9, 0x074a0, 0x08cbc, 0x06ee4, 0x09354, 0x067f4, 0x09c1b, 0x060b0, 0x0a62a, 0x05a9c, 0x0af5b, 0x0548d, 0x0b955, 0x04f56, 0x0c2a9, 0x04a87, 0x0cbf7, 0x045d6, 0x0d5c3, 0x04144, 0x0e01b, 0x03d88, 0x0e937, 0x039e0, 0x0f2cd, 0x03663, 0x0fc9e, 0x03347, 0x10600, 0x03050, 0x10f95, 0x02d4d, 0x11a02, 0x02ad3, 0x12333, 0x0286e, 0x12cad, 0x02604, 0x136df, 0x02425, 0x13f48, 0x021f4, 0x149c4, 0x0203e, 0x1527b, 0x01e4d, 0x15d00, 0x01c99, 0x166de, 0x01b18, 0x17017, 0x019a5, 0x17988, 0x01841, 0x18327, 0x016df, 0x18d50, 0x015d9, 0x19547, 0x0147c, 0x1a083, 0x0138e, 0x1a8a3, 0x01251, 0x1b418, 0x01166, 0x1bd27, 0x01068, 0x1c77b, 0x00f7f, 0x1d18e, 0x00eda, 0x1d91a, 0x00e19, 0x1e254, 0x00d4f, 0x1ec9a, 0x00c90, 0x1f6e0, 0x00c01, 0x1fef8, 0x00b5f, 0x208b1, 0x00ab6, 0x21362, 0x00a15, 0x21e46, 0x00988, 0x2285d, 0x00934, 0x22ea8, 0x008a8, 0x239b2, 0x0081d, 0x24577, 0x007c9, 0x24ce6, 0x00763, 0x25663, 0x00710, 0x25e8f, 0x006a0, 0x26a26, 0x00672, 0x26f23, 0x005e8, 0x27ef8, 0x005ba, 0x284b5, 0x0055e, 0x29057, 0x0050c, 0x29bab, 0x004c1, 0x2a674, 0x004a7, 0x2aa5e, 0x0046f, 0x2b32f, 0x0041f, 0x2c0ad, 0x003e7, 0x2ca8d, 0x003ba, 0x2d323, 0x0010c, 0x3bfbb }; const uint32_t entropy_table_theory[128] = { 0x08000, 0x08000, 0x076da, 0x089a0, 0x06e92, 0x09340, 0x0670a, 0x09cdf, 0x06029, 0x0a67f, 0x059dd, 0x0b01f, 0x05413, 0x0b9bf, 0x04ebf, 0x0c35f, 0x049d3, 0x0ccff, 0x04546, 0x0d69e, 0x0410d, 
0x0e03e, 0x03d22, 0x0e9de, 0x0397d, 0x0f37e, 0x03619, 0x0fd1e, 0x032ee, 0x106be, 0x02ffa, 0x1105d, 0x02d37, 0x119fd, 0x02aa2, 0x1239d, 0x02836, 0x12d3d, 0x025f2, 0x136dd, 0x023d1, 0x1407c, 0x021d2, 0x14a1c, 0x01ff2, 0x153bc, 0x01e2f, 0x15d5c, 0x01c87, 0x166fc, 0x01af7, 0x1709b, 0x0197f, 0x17a3b, 0x0181d, 0x183db, 0x016d0, 0x18d7b, 0x01595, 0x1971b, 0x0146c, 0x1a0bb, 0x01354, 0x1aa5a, 0x0124c, 0x1b3fa, 0x01153, 0x1bd9a, 0x01067, 0x1c73a, 0x00f89, 0x1d0da, 0x00eb7, 0x1da79, 0x00df0, 0x1e419, 0x00d34, 0x1edb9, 0x00c82, 0x1f759, 0x00bda, 0x200f9, 0x00b3c, 0x20a99, 0x00aa5, 0x21438, 0x00a17, 0x21dd8, 0x00990, 0x22778, 0x00911, 0x23118, 0x00898, 0x23ab8, 0x00826, 0x24458, 0x007ba, 0x24df7, 0x00753, 0x25797, 0x006f2, 0x26137, 0x00696, 0x26ad7, 0x0063f, 0x27477, 0x005ed, 0x27e17, 0x0059f, 0x287b6, 0x00554, 0x29156, 0x0050e, 0x29af6, 0x004cc, 0x2a497, 0x0048d, 0x2ae35, 0x00451, 0x2b7d6, 0x00418, 0x2c176, 0x003e2, 0x2cb15, 0x003af, 0x2d4b5, 0x0037f, 0x2de55 }; #endif void CABAC_encoder_estim::write_CABAC_bit(int modelIdx, int bit) { context_model* model = &(*mCtxModels)[modelIdx]; //printf("[%d] state=%d, bin=%d\n", encBinCnt, model->state,bit); //encBinCnt++; int idx = model->state<<1; if (bit==model->MPSbit) { model->state = next_state_MPS[model->state]; } else { idx++; if (model->state==0) { model->MPSbit = 1-model->MPSbit; } model->state = next_state_LPS[model->state]; } mFracBits += entropy_table[idx]; //printf("-> %08lx %f\n",entropy_table[idx], entropy_table[idx] / float(1<<15)); } float CABAC_encoder::RDBits_for_CABAC_bin(int modelIdx, int bit) { context_model* model = &(*mCtxModels)[modelIdx]; int idx = model->state<<1; if (bit!=model->MPSbit) { idx++; } return entropy_table[idx] / float(1<<15); } void CABAC_encoder::write_CABAC_EGk(int val, int k) { while (val >= ( 1 << k ) ) { write_CABAC_bypass(1); val = val - ( 1 << k ); k++; } write_CABAC_bypass(0); while (k) { k--; write_CABAC_bypass((val >> k) & 1); } } void CABAC_encoder_estim_constant::write_CABAC_bit(int 
modelIdx, int bit) { context_model* model = &(*mCtxModels)[modelIdx]; int idx = model->state<<1; if (bit!=model->MPSbit) { idx++; } mFracBits += entropy_table[idx]; } #if 0 void printtab(int idx,int s) { printf("%d %f %f %f\n", s, double(entropy_table[idx])/0x8000, double(entropy_table_orig[idx])/0x8000, double(entropy_table_f265[idx])/0x8000); } void plot_tables() { for (int i=-62;i<=0;i++) { int idx = -i *2; int s = i; printtab(idx,s); } for (int i=0;i<=62;i++) { int idx = 2*i +1; int s = i; printtab(idx,s); } } #endif libde265-1.0.18/libde265/cabac.h000066400000000000000000000127371515675107500156610ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #ifndef DE265_CABAC_H #define DE265_CABAC_H #include #include "contextmodel.h" class CABAC_decoder { public: void init(uint8_t* bitstream, int length); void init_CABAC(); int decode_bit(context_model* model); int decode_TU(int cMax, context_model* model); int decode_term_bit(); int decode_bypass(); int decode_TU_bypass(int cMax); uint32_t decode_FL_bypass(int nBits); int decode_TR_bypass(int cRiceParam, int cTRMax); uint32_t decode_EGk_bypass(int k); uint8_t* bitstream_start = nullptr; uint8_t* bitstream_curr = nullptr; uint8_t* bitstream_end = nullptr; private: uint32_t range = 0; uint32_t value = 0; int16_t bits_needed = 0; int decode_FL_bypass_parallel(int nBits); }; // --------------------------------------------------------------------------- class CABAC_encoder { public: CABAC_encoder() : mCtxModels(nullptr) { } virtual ~CABAC_encoder() { } virtual int size() const = 0; virtual void reset() = 0; // --- VLC --- virtual void write_bits(uint32_t bits,int n) = 0; virtual void write_bit(int bit) { write_bits(bit,1); } virtual void write_uvlc(int value); virtual void write_svlc(int value); virtual bool write_startcode() = 0; virtual void skip_bits(int nBits) = 0; virtual void add_trailing_bits(); virtual int number_free_bits_in_byte() const = 0; // output all remaining bits and fill with zeros to next byte boundary virtual void flush_VLC() { } // --- CABAC --- void set_context_models(context_model_table* models) { mCtxModels=models; } virtual void init_CABAC() { } virtual void write_CABAC_bit(int modelIdx, int bit) = 0; virtual void write_CABAC_bypass(int bit) = 0; virtual void write_CABAC_TU_bypass(int value, int cMax); virtual void write_CABAC_FL_bypass(int value, int nBits); virtual void write_CABAC_term_bit(int bit) = 0; virtual void flush_CABAC() { } void write_CABAC_EGk(int absolute_symbol, int k); // absolute_symbol >= 0 virtual bool modifies_context() const = 0; float RDBits_for_CABAC_bin(int modelIdx, int bit); protected: context_model_table* 
mCtxModels; }; class CABAC_encoder_bitstream : public CABAC_encoder { public: CABAC_encoder_bitstream(); ~CABAC_encoder_bitstream(); void reset() override; int size() const override { return data_size; } uint8_t* data() const { return data_mem; } // --- VLC --- void write_bits(uint32_t bits,int n) override; bool write_startcode() override; void skip_bits(int nBits) override; int number_free_bits_in_byte() const override; // output all remaining bits and fill with zeros to next byte boundary void flush_VLC() override; // --- CABAC --- void init_CABAC() override; void write_CABAC_bit(int modelIdx, int bit) override; void write_CABAC_bypass(int bit) override; void write_CABAC_term_bit(int bit) override; void flush_CABAC() override; bool modifies_context() const override { return true; } private: // data buffer uint8_t* data_mem = nullptr; uint32_t data_capacity = 0; uint32_t data_size = 0; char state = 0; // for inserting emulation-prevention bytes // VLC uint32_t vlc_buffer; uint32_t vlc_buffer_len = 0; // CABAC uint32_t range; uint32_t low; int8_t bits_left; uint8_t buffered_byte; uint16_t num_buffered_bytes; bool check_size_and_resize(int nBytes); void testAndWriteOut(); void write_out(); bool append_byte(int byte); }; class CABAC_encoder_estim : public CABAC_encoder { public: CABAC_encoder_estim() : mFracBits(0) { } void reset() override { mFracBits=0; } int size() const override { return mFracBits>>(15+3); } uint64_t getFracBits() const { return mFracBits; } float getRDBits() const { return mFracBits / float(1<<15); } // --- VLC --- void write_bits(uint32_t bits,int n) override { mFracBits += n<<15; } void write_bit(int bit) override { mFracBits+=1<<15; } bool write_startcode() override { mFracBits += (1<<15)*8*3; return true; } void skip_bits(int nBits) override { mFracBits += nBits<<15; } int number_free_bits_in_byte() const override { return 0; } // TODO, good enough for now // --- CABAC --- void write_CABAC_bit(int modelIdx, int bit) override; void 
write_CABAC_bypass(int bit) override { mFracBits += 0x8000; } void write_CABAC_FL_bypass(int value, int nBits) override { mFracBits += nBits<<15; } void write_CABAC_term_bit(int bit) override { /* not implemented (not needed) */ } bool modifies_context() const override { return true; } protected: uint64_t mFracBits; }; class CABAC_encoder_estim_constant : public CABAC_encoder_estim { public: void write_CABAC_bit(int modelIdx, int bit) override; bool modifies_context() const override { return false; } }; #endif libde265-1.0.18/libde265/contextmodel.cc000066400000000000000000000307141515675107500174660ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #include "slice.h" #include #include #include bool D = false; context_model_table::context_model_table(const context_model_table& src) { if (D) printf("%p c'tor = %p\n",static_cast(this),static_cast(&src)); if (src.refcnt) { (*(src.refcnt))++; } refcnt = src.refcnt; model = src.model; } context_model_table::~context_model_table() { if (D) printf("%p destructor\n",static_cast(this)); if (refcnt) { (*refcnt)--; if (*refcnt==0) { if (D) printf("mfree %p\n",static_cast(model)); delete[] model; delete refcnt; } } } void context_model_table::init(int initType, int QPY) { if (D) printf("%p init\n",static_cast(this)); decouple_or_alloc_with_empty_data(); initialize_CABAC_models(model, initType, QPY); } void context_model_table::release() { if (D) printf("%p release %p\n",static_cast(this),static_cast(refcnt)); if (!refcnt) { return; } // if (*refcnt == 1) { return; } <- keep memory for later, but does not work when we believe that we freed the memory and nulled all references (*refcnt)--; if (*refcnt==0) { delete[] model; delete refcnt; } model = nullptr; refcnt= nullptr; } void context_model_table::decouple() { if (D) printf("%p decouple (%p)\n",static_cast(this),static_cast(refcnt)); assert(refcnt); // not necessarily so, but we never use it on an uninitialized object if (*refcnt > 1) { (*refcnt)--; context_model* oldModel = model; model = new context_model[CONTEXT_MODEL_TABLE_LENGTH]; refcnt= new int; *refcnt=1; memcpy(model,oldModel,sizeof(context_model)*CONTEXT_MODEL_TABLE_LENGTH); } } context_model_table context_model_table::transfer() { context_model_table newtable; newtable.model = model; newtable.refcnt= refcnt; model =nullptr; refcnt=nullptr; return newtable; } context_model_table& context_model_table::operator=(const context_model_table& src) { if (D) printf("%p assign = %p\n",static_cast(this),static_cast(&src)); // assert(src.refcnt); // not necessarily so, but we never use it on an uninitialized object if (!src.refcnt) { release(); return *this; } 
(*(src.refcnt))++; release(); model = src.model; refcnt= src.refcnt; return *this; } bool context_model_table::operator==(const context_model_table& b) const { if (b.model == model) return true; if (b.model == nullptr || model == nullptr) return false; for (int i=0;i1); (*refcnt)--; } if (D) printf("%p (alloc)\n",static_cast(this)); model = new context_model[CONTEXT_MODEL_TABLE_LENGTH]; // Without initializing the model, we got an invalid model state during decoding (issue #236) memset(model, 0, sizeof(context_model) * CONTEXT_MODEL_TABLE_LENGTH); refcnt= new int; *refcnt=1; } static void set_initValue(int SliceQPY, context_model* model, int initValue, int nContexts) { int slopeIdx = initValue >> 4; int intersecIdx = initValue & 0xF; int m = slopeIdx*5 - 45; int n = (intersecIdx<<3) - 16; int preCtxState = Clip3(1,126, ((m*Clip3(0,51, SliceQPY))>>4)+n); // logtrace(LogSlice,"QP=%d slopeIdx=%d intersecIdx=%d m=%d n=%d\n",SliceQPY,slopeIdx,intersecIdx,m,n); for (int i=0;i 0) { init_context(QPY, cm+CONTEXT_MODEL_CU_SKIP_FLAG, initValue_cu_skip_flag[initType-1], 3); init_context(QPY, cm+CONTEXT_MODEL_PRED_MODE_FLAG, &initValue_pred_mode_flag[initType-1], 1); init_context(QPY, cm+CONTEXT_MODEL_MERGE_FLAG, &initValue_merge_flag[initType-1],1); init_context(QPY, cm+CONTEXT_MODEL_MERGE_IDX, &initValue_merge_idx[initType-1], 1); init_context(QPY, cm+CONTEXT_MODEL_INTER_PRED_IDC, initValue_inter_pred_idc, 5); init_context(QPY, cm+CONTEXT_MODEL_REF_IDX_LX, initValue_ref_idx_lX, 2); init_context(QPY, cm+CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG, &initValue_abs_mvd_greater01_flag[initType == 1 ? 
0 : 2], 2); init_context(QPY, cm+CONTEXT_MODEL_MVP_LX_FLAG, initValue_mvp_lx_flag, 1); init_context(QPY, cm+CONTEXT_MODEL_RQT_ROOT_CBF, initValue_rqt_root_cbf, 1); init_context_const(QPY, cm+CONTEXT_MODEL_RDPCM_FLAG, 139, 2); init_context_const(QPY, cm+CONTEXT_MODEL_RDPCM_DIR, 139, 2); } init_context(QPY, cm+CONTEXT_MODEL_SPLIT_CU_FLAG, initValue_split_cu_flag[initType], 3); init_context(QPY, cm+CONTEXT_MODEL_PART_MODE, &initValue_part_mode[(initType!=2 ? initType : 5)], 4); init_context(QPY, cm+CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG, &initValue_prev_intra_luma_pred_flag[initType], 1); init_context(QPY, cm+CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE, &initValue_intra_chroma_pred_mode[initType], 1); init_context(QPY, cm+CONTEXT_MODEL_CBF_LUMA, &initValue_cbf_luma[initType == 0 ? 0 : 2], 2); init_context(QPY, cm+CONTEXT_MODEL_CBF_CHROMA, &initValue_cbf_chroma[initType * 4], 4); init_context(QPY, cm+CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG, &initValue_split_transform_flag[initType * 3], 3); init_context(QPY, cm+CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX, &initValue_last_significant_coefficient_prefix[initType * 18], 18); init_context(QPY, cm+CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX, &initValue_last_significant_coefficient_prefix[initType * 18], 18); init_context(QPY, cm+CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG, &initValue_coded_sub_block_flag[initType * 4], 4); init_context(QPY, cm+CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG, initValue_significant_coeff_flag[initType], 42); init_context(QPY, cm+CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG+42, initValue_significant_coeff_flag_skipmode[initType], 2); init_context(QPY, cm+CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG, &initValue_coeff_abs_level_greater1_flag[initType * 24], 24); init_context(QPY, cm+CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG, &initValue_coeff_abs_level_greater2_flag[initType * 6], 6); init_context(QPY, cm+CONTEXT_MODEL_SAO_MERGE_FLAG, &initValue_sao_merge_leftUp_flag[initType], 1); init_context(QPY, 
cm+CONTEXT_MODEL_SAO_TYPE_IDX, &initValue_sao_type_idx_lumaChroma_flag[initType], 1); init_context(QPY, cm+CONTEXT_MODEL_CU_QP_DELTA_ABS, initValue_cu_qp_delta_abs, 2); init_context(QPY, cm+CONTEXT_MODEL_TRANSFORM_SKIP_FLAG, initValue_transform_skip_flag, 2); init_context(QPY, cm+CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG, &initValue_cu_transquant_bypass_flag[initType], 1); init_context_const(QPY, cm+CONTEXT_MODEL_LOG2_RES_SCALE_ABS_PLUS1, 154, 8); init_context_const(QPY, cm+CONTEXT_MODEL_RES_SCALE_SIGN_FLAG, 154, 2); init_context_const(QPY, cm+CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_FLAG, 154, 1); init_context_const(QPY, cm+CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_IDX, 154, 1); } libde265-1.0.18/libde265/contextmodel.h000066400000000000000000000117161515675107500173310ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * Authors: struktur AG, Dirk Farin * Min Chen * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #ifndef DE265_CONTEXTMODEL_H #define DE265_CONTEXTMODEL_H #include "libde265/cabac.h" #include "libde265/de265.h" #include #include struct context_model { uint8_t MPSbit : 1; uint8_t state : 7; bool operator==(context_model b) const { return state==b.state && MPSbit==b.MPSbit; } bool operator!=(context_model b) const { return state!=b.state || MPSbit!=b.MPSbit; } }; enum context_model_index { // SAO CONTEXT_MODEL_SAO_MERGE_FLAG = 0, CONTEXT_MODEL_SAO_TYPE_IDX = CONTEXT_MODEL_SAO_MERGE_FLAG +1, // CB-tree CONTEXT_MODEL_SPLIT_CU_FLAG = CONTEXT_MODEL_SAO_TYPE_IDX + 1, CONTEXT_MODEL_CU_SKIP_FLAG = CONTEXT_MODEL_SPLIT_CU_FLAG + 3, // intra-prediction CONTEXT_MODEL_PART_MODE = CONTEXT_MODEL_CU_SKIP_FLAG + 3, CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG = CONTEXT_MODEL_PART_MODE + 4, CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE = CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG + 1, // transform-tree CONTEXT_MODEL_CBF_LUMA = CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE + 1, CONTEXT_MODEL_CBF_CHROMA = CONTEXT_MODEL_CBF_LUMA + 2, CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG = CONTEXT_MODEL_CBF_CHROMA + 4, CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_FLAG = CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG + 3, CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_IDX = CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_FLAG + 1, // residual CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX = CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_IDX + 1, CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX = CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX + 18, CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG = CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX + 18, CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG = CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG + 4, CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG = CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG + 42+2, CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG = CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG + 24, CONTEXT_MODEL_CU_QP_DELTA_ABS = CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG + 6, CONTEXT_MODEL_TRANSFORM_SKIP_FLAG = CONTEXT_MODEL_CU_QP_DELTA_ABS + 2, 
CONTEXT_MODEL_RDPCM_FLAG = CONTEXT_MODEL_TRANSFORM_SKIP_FLAG + 2, CONTEXT_MODEL_RDPCM_DIR = CONTEXT_MODEL_RDPCM_FLAG + 2, // motion CONTEXT_MODEL_MERGE_FLAG = CONTEXT_MODEL_RDPCM_DIR + 2, CONTEXT_MODEL_MERGE_IDX = CONTEXT_MODEL_MERGE_FLAG + 1, CONTEXT_MODEL_PRED_MODE_FLAG = CONTEXT_MODEL_MERGE_IDX + 1, CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG = CONTEXT_MODEL_PRED_MODE_FLAG + 1, CONTEXT_MODEL_MVP_LX_FLAG = CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG + 2, CONTEXT_MODEL_RQT_ROOT_CBF = CONTEXT_MODEL_MVP_LX_FLAG + 1, CONTEXT_MODEL_REF_IDX_LX = CONTEXT_MODEL_RQT_ROOT_CBF + 1, CONTEXT_MODEL_INTER_PRED_IDC = CONTEXT_MODEL_REF_IDX_LX + 2, CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG = CONTEXT_MODEL_INTER_PRED_IDC + 5, CONTEXT_MODEL_LOG2_RES_SCALE_ABS_PLUS1 = CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG + 1, CONTEXT_MODEL_RES_SCALE_SIGN_FLAG = CONTEXT_MODEL_LOG2_RES_SCALE_ABS_PLUS1 + 8, CONTEXT_MODEL_TABLE_LENGTH = CONTEXT_MODEL_RES_SCALE_SIGN_FLAG + 2 }; void initialize_CABAC_models(context_model context_model_table[CONTEXT_MODEL_TABLE_LENGTH], int initType, int QPY); class context_model_table { public: context_model_table() = default; context_model_table(const context_model_table&); ~context_model_table(); void init(int initType, int QPY); void release(); void decouple(); context_model_table transfer(); context_model_table copy() const { context_model_table t=*this; t.decouple(); return t; } bool empty() const { return refcnt != nullptr; } context_model& operator[](int i) { return model[i]; } context_model_table& operator=(const context_model_table&); bool operator==(const context_model_table&) const; std::string debug_dump() const; private: void decouple_or_alloc_with_empty_data(); context_model* model = nullptr; // [CONTEXT_MODEL_TABLE_LENGTH] int* refcnt = nullptr; }; #endif libde265-1.0.18/libde265/de265-version.h.in000066400000000000000000000021611515675107500175330ustar00rootroot00000000000000/* * H.265 video codec. 
* Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ /* de265-version.h * * This file was generated by cmake when libde265 was built. * * DO NOT EDIT THIS FILE. */ #ifndef LIBDE265_VERSION_H #define LIBDE265_VERSION_H /* Numeric representation of the version */ #define LIBDE265_NUMERIC_VERSION @NUMERIC_VERSION@ /* Version string */ #define LIBDE265_VERSION "@PACKAGE_VERSION@" #endif libde265-1.0.18/libde265/de265.cc000066400000000000000000000527061515675107500156130ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #define DEBUG_INSERT_STREAM_ERRORS 0 #include "de265.h" #include "decctx.h" #include "util.h" #include "scan.h" #include "image.h" #include "sei.h" #include #include #include #include // TODO: should be in some vps.c related header de265_error read_vps(decoder_context* ctx, bitreader* reader, video_parameter_set* vps); extern "C" { LIBDE265_API const char *de265_get_version(void) { return (LIBDE265_VERSION); } LIBDE265_API uint32_t de265_get_version_number(void) { return (LIBDE265_NUMERIC_VERSION); } static uint8_t bcd2dec(uint8_t v) { return (v>>4) * 10 + (v & 0x0F); } LIBDE265_API int de265_get_version_number_major(void) { return bcd2dec(((LIBDE265_NUMERIC_VERSION)>>24) & 0xFF); } LIBDE265_API int de265_get_version_number_minor(void) { return bcd2dec(((LIBDE265_NUMERIC_VERSION)>>16) & 0xFF); } LIBDE265_API int de265_get_version_number_maintenance(void) { return bcd2dec(((LIBDE265_NUMERIC_VERSION)>>8) & 0xFF); } LIBDE265_API const char* de265_get_error_text(de265_error err) { switch (err) { case DE265_OK: return "no error"; case DE265_ERROR_NO_SUCH_FILE: return "no such file"; //case DE265_ERROR_NO_STARTCODE: return "no startcode found"; //case DE265_ERROR_EOF: return "end of file"; case DE265_ERROR_COEFFICIENT_OUT_OF_IMAGE_BOUNDS: return "coefficient out of image bounds"; case DE265_ERROR_CHECKSUM_MISMATCH: return "image checksum mismatch"; case DE265_ERROR_CTB_OUTSIDE_IMAGE_AREA: return "CTB outside of image area"; case DE265_ERROR_OUT_OF_MEMORY: return "out of memory"; case DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE: return "coded parameter out of range"; case DE265_ERROR_IMAGE_BUFFER_FULL: return "DPB/output queue full"; case DE265_ERROR_CANNOT_START_THREADPOOL: return "cannot start decoding threads"; case DE265_ERROR_LIBRARY_INITIALIZATION_FAILED: return "global library initialization failed"; case DE265_ERROR_LIBRARY_NOT_INITIALIZED: return "cannot free library data (not initialized"; //case DE265_ERROR_MAX_THREAD_CONTEXTS_EXCEEDED: // return "internal 
error: maximum number of thread contexts exceeded"; //case DE265_ERROR_MAX_NUMBER_OF_SLICES_EXCEEDED: // return "internal error: maximum number of slices exceeded"; case DE265_ERROR_NOT_IMPLEMENTED_YET: return "unimplemented decoder feature"; //case DE265_ERROR_SCALING_LIST_NOT_IMPLEMENTED: //return "scaling list not implemented"; case DE265_ERROR_WAITING_FOR_INPUT_DATA: return "no more input data, decoder stalled"; case DE265_ERROR_CANNOT_PROCESS_SEI: return "SEI data cannot be processed"; case DE265_ERROR_PARAMETER_PARSING: return "command-line parameter error"; case DE265_ERROR_NO_INITIAL_SLICE_HEADER: return "first slice missing, cannot decode dependent slice"; case DE265_ERROR_PREMATURE_END_OF_SLICE: return "premature end of slice data"; case DE265_ERROR_UNSPECIFIED_DECODING_ERROR: return "unspecified decoding error"; case DE265_WARNING_NO_WPP_CANNOT_USE_MULTITHREADING: return "Cannot run decoder multi-threaded because stream does not support WPP"; case DE265_WARNING_WARNING_BUFFER_FULL: return "Too many warnings queued"; case DE265_WARNING_PREMATURE_END_OF_SLICE_SEGMENT: return "Premature end of slice segment"; case DE265_WARNING_INCORRECT_ENTRY_POINT_OFFSET: return "Incorrect entry-point offsets"; case DE265_WARNING_CTB_OUTSIDE_IMAGE_AREA: return "CTB outside of image area (concealing stream error...)"; case DE265_WARNING_SPS_HEADER_INVALID: return "sps header invalid"; case DE265_WARNING_PPS_HEADER_INVALID: return "pps header invalid"; case DE265_WARNING_SLICEHEADER_INVALID: return "slice header invalid"; case DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING: return "impossible motion vector scaling"; case DE265_WARNING_NONEXISTING_PPS_REFERENCED: return "non-existing PPS referenced"; case DE265_WARNING_NONEXISTING_SPS_REFERENCED: return "non-existing SPS referenced"; case DE265_WARNING_BOTH_PREDFLAGS_ZERO: return "both predFlags[] are zero in MC"; case DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED: return "non-existing reference picture accessed"; case 
DE265_WARNING_NUMMVP_NOT_EQUAL_TO_NUMMVQ: return "numMV_P != numMV_Q in deblocking"; case DE265_WARNING_NUMBER_OF_SHORT_TERM_REF_PIC_SETS_OUT_OF_RANGE: return "number of short-term ref-pic-sets out of range"; case DE265_WARNING_SHORT_TERM_REF_PIC_SET_OUT_OF_RANGE: return "short-term ref-pic-set index out of range"; case DE265_WARNING_FAULTY_REFERENCE_PICTURE_LIST: return "faulty reference picture list"; case DE265_WARNING_EOSS_BIT_NOT_SET: return "end_of_sub_stream_one_bit not set to 1 when it should be"; case DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED: return "maximum number of reference pictures exceeded"; case DE265_WARNING_INVALID_CHROMA_FORMAT: return "invalid chroma format in SPS header"; case DE265_WARNING_SLICE_SEGMENT_ADDRESS_INVALID: return "slice segment address invalid"; case DE265_WARNING_DEPENDENT_SLICE_WITH_ADDRESS_ZERO: return "dependent slice with address 0"; case DE265_WARNING_NUMBER_OF_THREADS_LIMITED_TO_MAXIMUM: return "number of threads limited to maximum amount"; case DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER: return "non-existing long-term reference candidate specified in slice header"; case DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY: return "cannot apply SAO because we ran out of memory"; case DE265_WARNING_SPS_MISSING_CANNOT_DECODE_SEI: return "SPS header missing, cannot decode SEI"; case DE265_WARNING_COLLOCATED_MOTION_VECTOR_OUTSIDE_IMAGE_AREA: return "collocated motion-vector is outside image area"; case DE265_WARNING_PCM_BITDEPTH_TOO_LARGE: return "PCM bit-depth too large"; case DE265_WARNING_REFERENCE_IMAGE_BIT_DEPTH_DOES_NOT_MATCH: return "Bit-depth of reference image does not match current image"; case DE265_WARNING_REFERENCE_IMAGE_SIZE_DOES_NOT_MATCH_SPS: return "Size of reference image does not match current size in SPS"; case DE265_WARNING_CHROMA_OF_CURRENT_IMAGE_DOES_NOT_MATCH_SPS: return "Chroma format of current image does not match chroma in SPS"; case DE265_WARNING_BIT_DEPTH_OF_CURRENT_IMAGE_DOES_NOT_MATCH_SPS: 
return "Bit-depth of current image does not match SPS"; case DE265_WARNING_REFERENCE_IMAGE_CHROMA_FORMAT_DOES_NOT_MATCH: return "Chroma format of reference image does not match current image"; case DE265_WARNING_INVALID_SLICE_HEADER_INDEX_ACCESS: return "Access with invalid slice header index"; case DE265_WARNING_INVALID_TU_BLOCK_SPLIT: return "Transform block split below minimum transform size"; default: return "unknown error"; } } LIBDE265_API int de265_isOK(de265_error err) { return err == DE265_OK || err >= 1000; } static int de265_init_count; static std::mutex& de265_init_mutex() { static std::mutex de265_init_mutex; return de265_init_mutex; } LIBDE265_API de265_error de265_init() { std::lock_guard lock(de265_init_mutex()); de265_init_count++; if (de265_init_count > 1) { // we are not the first -> already initialized return DE265_OK; } // do initializations init_scan_orders(); if (!alloc_and_init_significant_coeff_ctxIdx_lookupTable()) { de265_init_count--; return DE265_ERROR_LIBRARY_INITIALIZATION_FAILED; } return DE265_OK; } LIBDE265_API de265_error de265_free() { std::lock_guard lock(de265_init_mutex()); if (de265_init_count<=0) { return DE265_ERROR_LIBRARY_NOT_INITIALIZED; } de265_init_count--; if (de265_init_count==0) { free_significant_coeff_ctxIdx_lookupTable(); } return DE265_OK; } LIBDE265_API de265_decoder_context* de265_new_decoder() { de265_error init_err = de265_init(); if (init_err != DE265_OK) { return nullptr; } decoder_context* ctx = new decoder_context; if (!ctx) { de265_free(); return nullptr; } return reinterpret_cast(ctx); } LIBDE265_API de265_error de265_free_decoder(de265_decoder_context* de265ctx) { decoder_context* ctx = reinterpret_cast(de265ctx); ctx->stop_thread_pool(); delete ctx; return de265_free(); } LIBDE265_API de265_error de265_start_worker_threads(de265_decoder_context* de265ctx, int number_of_threads) { decoder_context* ctx = reinterpret_cast(de265ctx); if (number_of_threads > MAX_THREADS) { number_of_threads = MAX_THREADS; 
} if (number_of_threads>0) { de265_error err = ctx->start_thread_pool(number_of_threads); if (de265_isOK(err)) { err = DE265_OK; } return err; } else { return DE265_OK; } } #ifndef LIBDE265_DISABLE_DEPRECATED LIBDE265_API de265_error de265_decode_data(de265_decoder_context* de265ctx, const void* data8, int len) { //decoder_context* ctx = reinterpret_cast(de265ctx); de265_error err; if (len > 0) { err = de265_push_data(de265ctx, data8, len, 0, nullptr); } else { err = de265_flush_data(de265ctx); } if (err != DE265_OK) { return err; } int more = 0; do { err = de265_decode(de265ctx, &more); if (err != DE265_OK) { more = 0; } switch (err) { case DE265_ERROR_WAITING_FOR_INPUT_DATA: // ignore error (didn't exist in 0.4 and before) err = DE265_OK; break; default: break; } } while (more); return err; } #endif #if 0 static void dumpdata(const void* data, int len) { for (int i=0;i(de265ctx); const uint8_t* data = reinterpret_cast(data8); //printf("push data (size %d)\n",len); //dumpdata(data8,16); return ctx->nal_parser.push_data(data,len,pts,user_data); } LIBDE265_API de265_error de265_push_NAL(de265_decoder_context* de265ctx, const void* data8, int len, de265_PTS pts, void* user_data) { decoder_context* ctx = reinterpret_cast(de265ctx); const uint8_t* data = reinterpret_cast(data8); //printf("push NAL (size %d)\n",len); //dumpdata(data8,16); return ctx->nal_parser.push_NAL(data,len,pts,user_data); } LIBDE265_API de265_error de265_decode(de265_decoder_context* de265ctx, int* more) { decoder_context* ctx = reinterpret_cast(de265ctx); return ctx->decode(more); } LIBDE265_API void de265_push_end_of_NAL(de265_decoder_context* de265ctx) { decoder_context* ctx = reinterpret_cast(de265ctx); ctx->nal_parser.flush_data(); } LIBDE265_API void de265_push_end_of_frame(de265_decoder_context* de265ctx) { de265_push_end_of_NAL(de265ctx); decoder_context* ctx = reinterpret_cast(de265ctx); ctx->nal_parser.mark_end_of_frame(); } LIBDE265_API de265_error 
de265_flush_data(de265_decoder_context* de265ctx) { de265_push_end_of_NAL(de265ctx); decoder_context* ctx = reinterpret_cast(de265ctx); ctx->nal_parser.flush_data(); ctx->nal_parser.mark_end_of_stream(); return DE265_OK; } LIBDE265_API void de265_reset(de265_decoder_context* de265ctx) { decoder_context* ctx = reinterpret_cast(de265ctx); //printf("--- reset ---\n"); ctx->reset(); } LIBDE265_API const struct de265_image* de265_get_next_picture(de265_decoder_context* de265ctx) { const struct de265_image* img = de265_peek_next_picture(de265ctx); if (img) { de265_release_next_picture(de265ctx); } return img; } LIBDE265_API const struct de265_image* de265_peek_next_picture(de265_decoder_context* de265ctx) { decoder_context* ctx = reinterpret_cast(de265ctx); if (ctx->num_pictures_in_output_queue()>0) { de265_image* img = ctx->get_next_picture_in_output_queue(); return img; } else { return nullptr; } } LIBDE265_API void de265_release_next_picture(de265_decoder_context* de265ctx) { decoder_context* ctx = reinterpret_cast(de265ctx); // no active output picture -> ignore release request if (ctx->num_pictures_in_output_queue()==0) { return; } de265_image* next_image = ctx->get_next_picture_in_output_queue(); loginfo(LogDPB, "release DPB with POC=%d\n",next_image->PicOrderCntVal); next_image->PicOutputFlag = false; // TODO: actually, we want to release it here, but we cannot without breaking API // compatibility, because get_next_picture calls this immediately. Hence, we release // images while scanning for available slots in the DPB. 
// if (next_image->can_be_released()) { next_image->release(); } // pop output queue ctx->pop_next_picture_in_output_queue(); } LIBDE265_API int de265_get_highest_TID(de265_decoder_context* de265ctx) { decoder_context* ctx = reinterpret_cast(de265ctx); return ctx->get_highest_TID(); } LIBDE265_API int de265_get_current_TID(de265_decoder_context* de265ctx) { decoder_context* ctx = reinterpret_cast(de265ctx); return ctx->get_current_TID(); } LIBDE265_API void de265_set_limit_TID(de265_decoder_context* de265ctx,int max_tid) { decoder_context* ctx = reinterpret_cast(de265ctx); ctx->set_limit_TID(max_tid); } LIBDE265_API void de265_set_framerate_ratio(de265_decoder_context* de265ctx,int percent) { decoder_context* ctx = reinterpret_cast(de265ctx); ctx->set_framerate_ratio(percent); } LIBDE265_API int de265_change_framerate(de265_decoder_context* de265ctx,int more) { decoder_context* ctx = reinterpret_cast(de265ctx); return ctx->change_framerate(more); } LIBDE265_API de265_error de265_get_warning(de265_decoder_context* de265ctx) { decoder_context* ctx = reinterpret_cast(de265ctx); return ctx->get_warning(); } LIBDE265_API void de265_set_parameter_bool(de265_decoder_context* de265ctx, enum de265_param param, int value) { decoder_context* ctx = reinterpret_cast(de265ctx); switch (param) { case DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH: ctx->param_sei_check_hash = !!value; break; case DE265_DECODER_PARAM_SUPPRESS_FAULTY_PICTURES: ctx->param_suppress_faulty_pictures = !!value; break; case DE265_DECODER_PARAM_DISABLE_DEBLOCKING: ctx->param_disable_deblocking = !!value; break; case DE265_DECODER_PARAM_DISABLE_SAO: ctx->param_disable_sao = !!value; break; /* case DE265_DECODER_PARAM_DISABLE_MC_RESIDUAL_IDCT: ctx->param_disable_mc_residual_idct = !!value; break; case DE265_DECODER_PARAM_DISABLE_INTRA_RESIDUAL_IDCT: ctx->param_disable_intra_residual_idct = !!value; break; */ default: assert(false); break; } } LIBDE265_API void de265_set_parameter_int(de265_decoder_context* 
de265ctx, enum de265_param param, int value) { decoder_context* ctx = reinterpret_cast(de265ctx); switch (param) { case DE265_DECODER_PARAM_DUMP_SPS_HEADERS: ctx->param_sps_headers_fd = value; break; case DE265_DECODER_PARAM_DUMP_VPS_HEADERS: ctx->param_vps_headers_fd = value; break; case DE265_DECODER_PARAM_DUMP_PPS_HEADERS: ctx->param_pps_headers_fd = value; break; case DE265_DECODER_PARAM_DUMP_SLICE_HEADERS: ctx->param_slice_headers_fd = value; break; case DE265_DECODER_PARAM_ACCELERATION_CODE: ctx->set_acceleration_functions(static_cast(value)); break; default: assert(false); break; } } LIBDE265_API int de265_get_parameter_bool(de265_decoder_context* de265ctx, enum de265_param param) { decoder_context* ctx = reinterpret_cast(de265ctx); switch (param) { case DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH: return ctx->param_sei_check_hash; case DE265_DECODER_PARAM_SUPPRESS_FAULTY_PICTURES: return ctx->param_suppress_faulty_pictures; case DE265_DECODER_PARAM_DISABLE_DEBLOCKING: return ctx->param_disable_deblocking; case DE265_DECODER_PARAM_DISABLE_SAO: return ctx->param_disable_sao; /* case DE265_DECODER_PARAM_DISABLE_MC_RESIDUAL_IDCT: return ctx->param_disable_mc_residual_idct; case DE265_DECODER_PARAM_DISABLE_INTRA_RESIDUAL_IDCT: return ctx->param_disable_intra_residual_idct; */ default: assert(false); return false; } } LIBDE265_API int de265_get_number_of_input_bytes_pending(de265_decoder_context* de265ctx) { decoder_context* ctx = reinterpret_cast(de265ctx); return ctx->nal_parser.bytes_in_input_queue(); } LIBDE265_API int de265_get_number_of_NAL_units_pending(de265_decoder_context* de265ctx) { decoder_context* ctx = reinterpret_cast(de265ctx); return ctx->nal_parser.number_of_NAL_units_pending(); } LIBDE265_API int de265_get_image_width(const struct de265_image* img,int channel) { switch (channel) { case 0: return img->width_confwin; case 1: case 2: return img->chroma_width_confwin; default: return 0; } } LIBDE265_API int de265_get_image_height(const struct 
de265_image* img,int channel) { switch (channel) { case 0: return img->height_confwin; case 1: case 2: return img->chroma_height_confwin; default: return 0; } } LIBDE265_API int de265_get_bits_per_pixel(const struct de265_image* img,int channel) { switch (channel) { case 0: return img->get_sps().BitDepth_Y; case 1: case 2: return img->get_sps().BitDepth_C; default: return 0; } } LIBDE265_API enum de265_chroma de265_get_chroma_format(const struct de265_image* img) { return img->get_chroma_format(); } LIBDE265_API const uint8_t* de265_get_image_plane(const de265_image* img, int channel, int* stride) { assert(channel>=0 && channel <= 2); uint8_t* data = img->pixels_confwin[channel]; if (stride) *stride = img->get_image_stride(channel) * ((de265_get_bits_per_pixel(img, channel)+7) / 8); return data; } LIBDE265_API void *de265_get_image_plane_user_data(const struct de265_image* img, int channel) { assert(channel>=0 && channel <= 2); return img->plane_user_data[channel]; } LIBDE265_API void de265_set_image_plane(de265_image* img, int cIdx, void* mem, int stride, void *userdata) { // The internal "stride" is the number of pixels per line. 
stride = stride / ((de265_get_bits_per_pixel(img, cIdx)+7) / 8); img->set_image_plane(cIdx, static_cast(mem), stride, userdata); } LIBDE265_API void de265_set_image_allocation_functions(de265_decoder_context* de265ctx, de265_image_allocation* allocfunc, void* userdata) { decoder_context* ctx = reinterpret_cast(de265ctx); ctx->set_image_allocation_functions(allocfunc, userdata); } LIBDE265_API const struct de265_image_allocation *de265_get_default_image_allocation_functions(void) { return &de265_image::default_image_allocation; } LIBDE265_API de265_PTS de265_get_image_PTS(const struct de265_image* img) { return img->pts; } LIBDE265_API void* de265_get_image_user_data(const struct de265_image* img) { return img->user_data; } LIBDE265_API void de265_set_image_user_data(struct de265_image* img, void *user_data) { img->user_data = user_data; } LIBDE265_API void de265_get_image_NAL_header(const struct de265_image* img, int* nal_unit_type, const char** nal_unit_name, int* nuh_layer_id, int* nuh_temporal_id) { if (nal_unit_type) *nal_unit_type = img->nal_hdr.nal_unit_type; if (nal_unit_name) *nal_unit_name = get_NAL_name(img->nal_hdr.nal_unit_type); if (nuh_layer_id) *nuh_layer_id = img->nal_hdr.nuh_layer_id; if (nuh_temporal_id) *nuh_temporal_id = img->nal_hdr.nuh_temporal_id; } LIBDE265_API int de265_get_image_full_range_flag(const struct de265_image* img) { return img->get_sps().vui.video_full_range_flag; } LIBDE265_API int de265_get_image_colour_primaries(const struct de265_image* img) { return img->get_sps().vui.colour_primaries; } LIBDE265_API int de265_get_image_transfer_characteristics(const struct de265_image* img) { return img->get_sps().vui.transfer_characteristics; } LIBDE265_API int de265_get_image_matrix_coefficients(const struct de265_image* img) { return img->get_sps().vui.matrix_coeffs; } } libde265-1.0.18/libde265/de265.h000066400000000000000000000436161515675107500154550ustar00rootroot00000000000000/* * H.265 video codec. 
* Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef DE265_H #define DE265_H #ifdef __cplusplus extern "C" { #endif #include //#define inline static __inline #ifndef __STDC_LIMIT_MACROS #define __STDC_LIMIT_MACROS 1 #endif #include #if (defined(_WIN32) || defined(__CYGWIN__)) && !defined(LIBDE265_STATIC_BUILD) #ifdef LIBDE265_EXPORTS #define LIBDE265_API __declspec(dllexport) #else #define LIBDE265_API __declspec(dllimport) #endif #elif HAVE_VISIBILITY #ifdef LIBDE265_EXPORTS #define LIBDE265_API __attribute__((__visibility__("default"))) #else #define LIBDE265_API #endif #else #define LIBDE265_API #endif #if __GNUC__ #define LIBDE265_DEPRECATED __attribute__((deprecated)) #elif defined(_MSC_VER) #define LIBDE265_DEPRECATED __declspec(deprecated) #else #define LIBDE265_DEPRECATED #endif #if defined(_MSC_VER) #define LIBDE265_INLINE __inline #else #define LIBDE265_INLINE inline #endif /* === version numbers === */ // version of linked libde265 library LIBDE265_API const char *de265_get_version(void); // returns the version number as a BCD number. // 0xAABBCCDD is interpreted as version AA.BB.CC. 
// For example: 0x02143000 is version 2.14.30 LIBDE265_API uint32_t de265_get_version_number(void); LIBDE265_API int de265_get_version_number_major(void); LIBDE265_API int de265_get_version_number_minor(void); LIBDE265_API int de265_get_version_number_maintenance(void); /* === error codes === */ typedef enum { DE265_OK = 0, DE265_ERROR_NO_SUCH_FILE=1, //DE265_ERROR_NO_STARTCODE=2, obsolet //DE265_ERROR_EOF=3, DE265_ERROR_COEFFICIENT_OUT_OF_IMAGE_BOUNDS=4, DE265_ERROR_CHECKSUM_MISMATCH=5, DE265_ERROR_CTB_OUTSIDE_IMAGE_AREA=6, DE265_ERROR_OUT_OF_MEMORY=7, DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE=8, DE265_ERROR_IMAGE_BUFFER_FULL=9, DE265_ERROR_CANNOT_START_THREADPOOL=10, DE265_ERROR_LIBRARY_INITIALIZATION_FAILED=11, DE265_ERROR_LIBRARY_NOT_INITIALIZED=12, DE265_ERROR_WAITING_FOR_INPUT_DATA=13, DE265_ERROR_CANNOT_PROCESS_SEI=14, DE265_ERROR_PARAMETER_PARSING=15, DE265_ERROR_NO_INITIAL_SLICE_HEADER=16, DE265_ERROR_PREMATURE_END_OF_SLICE=17, DE265_ERROR_UNSPECIFIED_DECODING_ERROR=18, // --- errors that should become obsolete in later libde265 versions --- //DE265_ERROR_MAX_THREAD_CONTEXTS_EXCEEDED = 500, obsolet //DE265_ERROR_MAX_NUMBER_OF_SLICES_EXCEEDED = 501, obsolet DE265_ERROR_NOT_IMPLEMENTED_YET = 502, //DE265_ERROR_SCALING_LIST_NOT_IMPLEMENTED = 502, obsolet // --- warnings --- DE265_WARNING_NO_WPP_CANNOT_USE_MULTITHREADING = 1000, DE265_WARNING_WARNING_BUFFER_FULL=1001, DE265_WARNING_PREMATURE_END_OF_SLICE_SEGMENT=1002, DE265_WARNING_INCORRECT_ENTRY_POINT_OFFSET=1003, DE265_WARNING_CTB_OUTSIDE_IMAGE_AREA=1004, DE265_WARNING_SPS_HEADER_INVALID=1005, DE265_WARNING_PPS_HEADER_INVALID=1006, DE265_WARNING_SLICEHEADER_INVALID=1007, DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING=1008, DE265_WARNING_NONEXISTING_PPS_REFERENCED=1009, DE265_WARNING_NONEXISTING_SPS_REFERENCED=1010, DE265_WARNING_BOTH_PREDFLAGS_ZERO=1011, DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED=1012, DE265_WARNING_NUMMVP_NOT_EQUAL_TO_NUMMVQ=1013, 
DE265_WARNING_NUMBER_OF_SHORT_TERM_REF_PIC_SETS_OUT_OF_RANGE=1014, DE265_WARNING_SHORT_TERM_REF_PIC_SET_OUT_OF_RANGE=1015, DE265_WARNING_FAULTY_REFERENCE_PICTURE_LIST=1016, DE265_WARNING_EOSS_BIT_NOT_SET=1017, DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED=1018, DE265_WARNING_INVALID_CHROMA_FORMAT=1019, DE265_WARNING_SLICE_SEGMENT_ADDRESS_INVALID=1020, DE265_WARNING_DEPENDENT_SLICE_WITH_ADDRESS_ZERO=1021, DE265_WARNING_NUMBER_OF_THREADS_LIMITED_TO_MAXIMUM=1022, DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER=1023, DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY=1024, DE265_WARNING_SPS_MISSING_CANNOT_DECODE_SEI=1025, DE265_WARNING_COLLOCATED_MOTION_VECTOR_OUTSIDE_IMAGE_AREA=1026, DE265_WARNING_PCM_BITDEPTH_TOO_LARGE=1027, DE265_WARNING_REFERENCE_IMAGE_BIT_DEPTH_DOES_NOT_MATCH=1028, DE265_WARNING_REFERENCE_IMAGE_SIZE_DOES_NOT_MATCH_SPS=1029, DE265_WARNING_CHROMA_OF_CURRENT_IMAGE_DOES_NOT_MATCH_SPS=1030, DE265_WARNING_BIT_DEPTH_OF_CURRENT_IMAGE_DOES_NOT_MATCH_SPS=1031, DE265_WARNING_REFERENCE_IMAGE_CHROMA_FORMAT_DOES_NOT_MATCH=1032, DE265_WARNING_INVALID_SLICE_HEADER_INDEX_ACCESS=1033, DE265_WARNING_INVALID_TU_BLOCK_SPLIT=1034 } de265_error; LIBDE265_API const char* de265_get_error_text(de265_error err); /* Returns true, if 'err' is DE265_OK or a warning. */ LIBDE265_API int de265_isOK(de265_error err); LIBDE265_API void de265_disable_logging(); // DEPRECATED LIBDE265_API void de265_set_verbosity(int level); /* === image === */ /* The image is currently always 3-channel YCbCr, with 4:2:0 chroma. But you may want to check the chroma format anyway for future compatibility. 
*/ typedef struct de265_image de265_image; typedef enum de265_chroma { de265_chroma_mono=0, de265_chroma_420=1, de265_chroma_422=2, de265_chroma_444=3 } de265_chroma; typedef int64_t de265_PTS; LIBDE265_API int de265_get_image_width(const de265_image*,int channel); LIBDE265_API int de265_get_image_height(const de265_image*,int channel); LIBDE265_API de265_chroma de265_get_chroma_format(const de265_image*); LIBDE265_API int de265_get_bits_per_pixel(const de265_image*,int channel); /* The |out_stride| is returned as "bytes per line" if a non-NULL parameter is given. */ LIBDE265_API const uint8_t* de265_get_image_plane(const de265_image*, int channel, int* out_stride); LIBDE265_API void* de265_get_image_plane_user_data(const de265_image*, int channel); LIBDE265_API de265_PTS de265_get_image_PTS(const de265_image*); LIBDE265_API void* de265_get_image_user_data(const de265_image*); LIBDE265_API void de265_set_image_user_data(de265_image*, void *user_data); /* Get NAL-header information of this frame. You can pass in NULL pointers if you do not need this piece of information. */ LIBDE265_API void de265_get_image_NAL_header(const de265_image*, int* nal_unit_type, const char** nal_unit_name, // textual description of 'nal_unit_type' int* nuh_layer_id, int* nuh_temporal_id); LIBDE265_API int de265_get_image_full_range_flag(const de265_image*); LIBDE265_API int de265_get_image_colour_primaries(const de265_image*); LIBDE265_API int de265_get_image_transfer_characteristics(const de265_image*); LIBDE265_API int de265_get_image_matrix_coefficients(const de265_image*); /* === decoder === */ typedef void de265_decoder_context; // private structure /* Get a new decoder context. Must be freed with de265_free_decoder(). */ LIBDE265_API de265_decoder_context* de265_new_decoder(void); /* Initialize background decoding threads. If this function is not called, all decoding is done in the main thread (no multi-threading). 
*/ LIBDE265_API de265_error de265_start_worker_threads(de265_decoder_context*, int number_of_threads); /* Free decoder context. May only be called once on a context. */ LIBDE265_API de265_error de265_free_decoder(de265_decoder_context*); #ifndef LIBDE265_DISABLE_DEPRECATED /* Push more data into the decoder, must be raw h265. All complete images in the data will be decoded, hence, do not push too much data at once to prevent image buffer overflows. The end of a picture can only be detected when the succeeding start-code is read from the data. If you want to flush the data and force decoding of the data so far (e.g. at the end of a file), call de265_decode_data() with 'length' zero. NOTE: This method is deprecated and will be removed in a future version. You should use "de265_push_data" or "de265_push_NAL" and "de265_decode" instead. */ LIBDE265_API LIBDE265_DEPRECATED de265_error de265_decode_data(de265_decoder_context*, const void* data, int length); #endif /* Push more data into the decoder, must be a raw h265 bytestream with startcodes. The PTS is assigned to all NALs whose start-code 0x000001 is contained in the data. The bytestream must contain all stuffing-bytes. This function only pushes data into the decoder, nothing will be decoded. */ LIBDE265_API de265_error de265_push_data(de265_decoder_context*, const void* data, int length, de265_PTS pts, void* user_data); /* Indicate that de265_push_data has just received data until the end of a NAL. The remaining pending input data is put into a NAL package and forwarded to the decoder. */ LIBDE265_API void de265_push_end_of_NAL(de265_decoder_context*); /* Indicate that de265_push_data has just received data until the end of a frame. All data pending at the decoder input will be pushed into the decoder and the decoded picture is pushed to the output queue. */ LIBDE265_API void de265_push_end_of_frame(de265_decoder_context*); /* Push a complete NAL unit without startcode into the decoder. 
The data must still contain all stuffing-bytes. This function only pushes data into the decoder, nothing will be decoded. */ LIBDE265_API de265_error de265_push_NAL(de265_decoder_context*, const void* data, int length, de265_PTS pts, void* user_data); /* Indicate the end-of-stream. All data pending at the decoder input will be pushed into the decoder and the decoded picture queue will be completely emptied. */ LIBDE265_API de265_error de265_flush_data(de265_decoder_context*); /* Return number of bytes pending at the decoder input. Can be used to avoid overflowing the decoder with too much data. */ LIBDE265_API int de265_get_number_of_input_bytes_pending(de265_decoder_context*); /* Return number of NAL units pending at the decoder input. Can be used to avoid overflowing the decoder with too much data. */ LIBDE265_API int de265_get_number_of_NAL_units_pending(de265_decoder_context*); /* Do some decoding. Returns status whether it did perform some decoding or why it could not do so. If 'more' is non-null, indicates whether de265_decode() should be called again (possibly after resolving the indicated problem). DE265_OK - decoding ok DE265_ERROR_IMAGE_BUFFER_FULL - DPB full, extract some images before continuing DE265_ERROR_WAITING_FOR_INPUT_DATA - insert more data before continuing You have to consider these cases: - decoding successful -> err = DE265_OK, more=true - decoding stalled -> err != DE265_OK, more=true - decoding finished -> err = DE265_OK, more=false - unresolvable error -> err != DE265_OK, more=false */ LIBDE265_API de265_error de265_decode(de265_decoder_context*, int* more); /* Clear decoder state. Call this when skipping in the stream. */ LIBDE265_API void de265_reset(de265_decoder_context*); /* Return next decoded picture, if there is any. If no complete picture has been decoded yet, NULL is returned. You should call de265_release_next_picture() to advance to the next picture. 
*/ LIBDE265_API const de265_image* de265_peek_next_picture(de265_decoder_context*); // may return NULL /* Get next decoded picture and remove this picture from the decoder output queue. Returns NULL is there is no decoded picture ready. You can use the picture only until you call any other de265_* function. */ LIBDE265_API const de265_image* de265_get_next_picture(de265_decoder_context*); // may return NULL /* Release the current decoded picture for reuse in the decoder. You should not use the data anymore after calling this function. */ LIBDE265_API void de265_release_next_picture(de265_decoder_context*); LIBDE265_API de265_error de265_get_warning(de265_decoder_context*); typedef enum de265_image_format { de265_image_format_mono8 = 1, de265_image_format_YUV420P8 = 2, de265_image_format_YUV422P8 = 3, de265_image_format_YUV444P8 = 4 } de265_image_format; typedef struct de265_image_spec { de265_image_format format; int width; int height; int alignment; // conformance window int crop_left; int crop_right; int crop_top; int crop_bottom; int visible_width; // convenience, width - crop_left - crop_right int visible_height; // convenience, height - crop_top - crop_bottom } de265_image_spec; typedef struct de265_image_allocation { int (*get_buffer)(de265_decoder_context* ctx, // first parameter deprecated de265_image_spec* spec, de265_image* img, void* userdata); void (*release_buffer)(de265_decoder_context* ctx, // first parameter deprecated de265_image* img, void* userdata); } de265_image_allocation; /* The user data pointer will be given to the get_buffer() and release_buffer() functions in de265_image_allocation. 
*/ LIBDE265_API void de265_set_image_allocation_functions(de265_decoder_context*, de265_image_allocation*, void* userdata); LIBDE265_API const de265_image_allocation *de265_get_default_image_allocation_functions(void); LIBDE265_API void de265_set_image_plane(de265_image* img, int cIdx, void* mem, int stride, void *userdata); /* --- frame dropping API --- To limit decoding to a maximum temporal layer (TID), use de265_set_limit_TID(). The maximum layer ID in the stream can be queried with de265_get_highest_TID(). Note that the maximum layer ID can change throughout the stream. For a fine-grained selection of the frame-rate, use de265_set_framerate_ratio(). A percentage of 100% will decode all frames in all temporal layers. A lower percentage will drop approximately as many frames. Note that this only accurate if the frames are distributed evenly among the layers. Otherwise, the mapping is non-linear. The limit_TID has a higher precedence than framerate_ratio. Hence, setting a higher framerate-ratio will decode at limit_TID without dropping. With change_framerate(), the output frame-rate can be increased/decreased to some discrete preferable values. Currently, these are non-dropped decoding at various TID layers. */ LIBDE265_API int de265_get_highest_TID(de265_decoder_context*); // highest temporal substream to decode LIBDE265_API int de265_get_current_TID(de265_decoder_context*); // currently decoded temporal substream LIBDE265_API void de265_set_limit_TID(de265_decoder_context*,int max_tid); // highest temporal substream to decode LIBDE265_API void de265_set_framerate_ratio(de265_decoder_context*,int percent); // percentage of frames to decode (approx) LIBDE265_API int de265_change_framerate(de265_decoder_context*,int more_vs_less); // 1: more, -1: less, returns corresponding framerate_ratio /* --- decoding parameters --- */ typedef enum de265_param { DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH=0, // (bool) Perform SEI hash check on decoded pictures. 
DE265_DECODER_PARAM_DUMP_SPS_HEADERS=1, // (int) Dump headers to specified file-descriptor. DE265_DECODER_PARAM_DUMP_VPS_HEADERS=2, DE265_DECODER_PARAM_DUMP_PPS_HEADERS=3, DE265_DECODER_PARAM_DUMP_SLICE_HEADERS=4, DE265_DECODER_PARAM_ACCELERATION_CODE=5, // (int) enum de265_acceleration, default: AUTO DE265_DECODER_PARAM_SUPPRESS_FAULTY_PICTURES=6, // (bool) do not output frames with decoding errors, default: no (output all images) DE265_DECODER_PARAM_DISABLE_DEBLOCKING=7, // (bool) disable deblocking DE265_DECODER_PARAM_DISABLE_SAO=8 // (bool) disable SAO filter //DE265_DECODER_PARAM_DISABLE_MC_RESIDUAL_IDCT=9, // (bool) disable decoding of IDCT residuals in MC blocks //DE265_DECODER_PARAM_DISABLE_INTRA_RESIDUAL_IDCT=10 // (bool) disable decoding of IDCT residuals in MC blocks } de265_param; // sorted such that a large ID includes all optimizations from lower IDs typedef enum de265_acceleration { de265_acceleration_SCALAR = 0, // only fallback implementation de265_acceleration_MMX = 10, de265_acceleration_SSE = 20, de265_acceleration_SSE2 = 30, de265_acceleration_SSE4 = 40, de265_acceleration_AVX = 50, // not implemented yet de265_acceleration_AVX2 = 60, // not implemented yet de265_acceleration_ARM = 70, de265_acceleration_NEON = 80, de265_acceleration_AUTO = 10000 } de265_acceleration; /* Set decoding parameters. */ LIBDE265_API void de265_set_parameter_bool(de265_decoder_context*, de265_param param, int value); LIBDE265_API void de265_set_parameter_int(de265_decoder_context*, de265_param param, int value); /* Get decoding parameters. */ LIBDE265_API int de265_get_parameter_bool(de265_decoder_context*, de265_param param); /* --- optional library initialization --- */ /* Static library initialization. Must be paired with de265_free(). Initialization is optional, since it will be done implicitly in de265_new_decoder(). Return value is false if initialization failed. Only call de265_free() when initialization was successful. 
Multiple calls to 'init' are allowed, but must be matched with an equal number of 'free' calls. */ LIBDE265_API de265_error de265_init(void); /* Free global library data. An implicit free call is made in de265_free_decoder(). Returns false if library was not initialized before, or if 'free' was called more often than 'init'. */ LIBDE265_API de265_error de265_free(void); #ifdef __cplusplus } #endif #endif libde265-1.0.18/libde265/deblock.cc000066400000000000000000001011131515675107500163540ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #include "deblock.h" #include "util.h" #include "transform.h" #include "de265.h" #include // 8.7.2.1 for both EDGE_HOR and EDGE_VER at the same time void markTransformBlockBoundary(de265_image* img, int x0,int y0, int log2TrafoSize,int trafoDepth, int filterLeftCbEdge, int filterTopCbEdge) { logtrace(LogDeblock,"markTransformBlockBoundary(%d,%d, %d,%d, %d,%d)\n",x0,y0, log2TrafoSize,trafoDepth, filterLeftCbEdge,filterTopCbEdge); int split_transform = img->get_split_transform_flag(x0,y0,trafoDepth); if (split_transform) { int x1 = x0 + ((1<>1); int y1 = y0 + ((1<>1); markTransformBlockBoundary(img,x0,y0,log2TrafoSize-1,trafoDepth+1, filterLeftCbEdge, filterTopCbEdge); markTransformBlockBoundary(img,x1,y0,log2TrafoSize-1,trafoDepth+1, DEBLOCK_FLAG_VERTI, filterTopCbEdge); markTransformBlockBoundary(img,x0,y1,log2TrafoSize-1,trafoDepth+1, filterLeftCbEdge, DEBLOCK_FLAG_HORIZ); markTransformBlockBoundary(img,x1,y1,log2TrafoSize-1,trafoDepth+1, DEBLOCK_FLAG_VERTI, DEBLOCK_FLAG_HORIZ); } else { // VER for (int k=0;k<(1<set_deblk_flags(x0,y0+k, filterLeftCbEdge); } // HOR for (int k=0;k<(1<set_deblk_flags(x0+k,y0, filterTopCbEdge); } } } // 8.7.2.2 for both EDGE_HOR and EDGE_VER at the same time void markPredictionBlockBoundary(de265_image* img, int x0,int y0, int log2CbSize, int filterLeftCbEdge, int filterTopCbEdge) { logtrace(LogDeblock,"markPredictionBlockBoundary(%d,%d, %d, %d,%d)\n",x0,y0, log2CbSize, filterLeftCbEdge,filterTopCbEdge); enum PartMode partMode = img->get_PartMode(x0,y0); int cbSize = 1<set_deblk_flags(x0+cbSize2,y0+k, DEBLOCK_PB_EDGE_VERTI); img->set_deblk_flags(x0+k,y0+cbSize2, DEBLOCK_PB_EDGE_HORIZ); } break; case PART_Nx2N: for (int k=0;kset_deblk_flags(x0+cbSize2,y0+k, DEBLOCK_PB_EDGE_VERTI); } break; case PART_2NxN: for (int k=0;kset_deblk_flags(x0+k,y0+cbSize2, DEBLOCK_PB_EDGE_HORIZ); } break; case PART_nLx2N: for (int k=0;kset_deblk_flags(x0+cbSize4,y0+k, DEBLOCK_PB_EDGE_VERTI); } break; case PART_nRx2N: for (int 
k=0;kset_deblk_flags(x0+cbSize2+cbSize4,y0+k, DEBLOCK_PB_EDGE_VERTI); } break; case PART_2NxnU: for (int k=0;kset_deblk_flags(x0+k,y0+cbSize4, DEBLOCK_PB_EDGE_HORIZ); } break; case PART_2NxnD: for (int k=0;kset_deblk_flags(x0+k,y0+cbSize2+cbSize4, DEBLOCK_PB_EDGE_HORIZ); } break; case PART_2Nx2N: // NOP break; } } bool derive_edgeFlags_CTBRow(de265_image* img, uint16_t ctby) { const seq_parameter_set& sps = img->get_sps(); const pic_parameter_set& pps = img->get_pps(); const int minCbSize = sps.MinCbSizeY; bool deblocking_enabled=false; // whether deblocking is enabled in some part of the image int ctb_mask = (1<> sps.Log2MinCbSizeY; uint16_t cb_y_end = ((ctby+1) << sps.Log2CtbSizeY) >> sps.Log2MinCbSizeY; cb_y_end = std::min(cb_y_end, sps.PicHeightInMinCbsY); for (int cb_y=cb_y_start;cb_yget_sps().PicWidthInMinCbsY;cb_x++) { int log2CbSize = img->get_log2CbSize_cbUnits(cb_x,cb_y); if (log2CbSize==0) { continue; } // we are now at the top corner of a CB int x0 = cb_x * minCbSize; int y0 = cb_y * minCbSize; int x0ctb = x0 >> ctbshift; int y0ctb = y0 >> ctbshift; // check for corrupted streams if (img->is_SliceHeader_available(x0,y0)==false) { return false; } // check whether we should filter this slice slice_segment_header* shdr = img->get_SliceHeader(x0,y0); // check whether to filter left and top edge uint8_t filterLeftCbEdge = DEBLOCK_FLAG_VERTI; uint8_t filterTopCbEdge = DEBLOCK_FLAG_HORIZ; if (x0 == 0) filterLeftCbEdge = 0; if (y0 == 0) filterTopCbEdge = 0; // check for slice and tile boundaries (8.7.2, step 2 in both processes) if (x0 && ((x0 & ctb_mask) == 0)) { // left edge at CTB boundary if (shdr->slice_loop_filter_across_slices_enabled_flag == 0 && img->is_SliceHeader_available(x0-1,y0) && // for corrupted streams shdr->SliceAddrRS != img->get_SliceHeader(x0-1,y0)->SliceAddrRS) { filterLeftCbEdge = 0; } else if (pps.loop_filter_across_tiles_enabled_flag == 0 && pps.TileIdRS[ x0ctb +y0ctb*picWidthInCtbs] != 
pps.TileIdRS[((x0-1)>>ctbshift)+y0ctb*picWidthInCtbs]) { filterLeftCbEdge = 0; } } if (y0 && ((y0 & ctb_mask) == 0)) { // top edge at CTB boundary if (shdr->slice_loop_filter_across_slices_enabled_flag == 0 && img->is_SliceHeader_available(x0,y0-1) && // for corrupted streams shdr->SliceAddrRS != img->get_SliceHeader(x0,y0-1)->SliceAddrRS) { filterTopCbEdge = 0; } else if (pps.loop_filter_across_tiles_enabled_flag == 0 && pps.TileIdRS[x0ctb+ y0ctb *picWidthInCtbs] != pps.TileIdRS[x0ctb+((y0-1)>>ctbshift)*picWidthInCtbs]) { filterTopCbEdge = 0; } } // mark edges if (shdr->slice_deblocking_filter_disabled_flag==0) { deblocking_enabled=true; markTransformBlockBoundary(img, x0,y0, log2CbSize,0, filterLeftCbEdge, filterTopCbEdge); markPredictionBlockBoundary(img, x0,y0, log2CbSize, filterLeftCbEdge, filterTopCbEdge); } } return deblocking_enabled; } bool derive_edgeFlags(de265_image* img) { bool deblocking_enabled=false; for (int y=0;yget_sps().PicHeightInCtbsY;y++) { deblocking_enabled |= derive_edgeFlags_CTBRow(img,y); } return deblocking_enabled; } // 8.7.2.3 (both, EDGE_VER and EDGE_HOR) void derive_boundaryStrength(de265_image* img, bool vertical, int yStart,int yEnd, int xStart,int xEnd) { int xIncr = vertical ? 2 : 1; int yIncr = vertical ? 1 : 2; int xOffs = vertical ? 1 : 0; int yOffs = vertical ? 0 : 1; int edgeMask = vertical ? (DEBLOCK_FLAG_VERTI | DEBLOCK_PB_EDGE_VERTI) : (DEBLOCK_FLAG_HORIZ | DEBLOCK_PB_EDGE_HORIZ); int transformEdgeMask = vertical ? DEBLOCK_FLAG_VERTI : DEBLOCK_FLAG_HORIZ; xEnd = libde265_min(xEnd,img->get_deblk_width()); yEnd = libde265_min(yEnd,img->get_deblk_height()); //int TUShift = img->get_sps().Log2MinTrafoSize; //int TUStride= img->get_sps().PicWidthInTbsY; for (int y=yStart;yget_deblk_flags(xDi,yDi) & edgeMask) ? 
"edge" : "..."); uint8_t edgeFlags = img->get_deblk_flags(xDi,yDi); if (edgeFlags & edgeMask) { bool p_is_intra_pred = (img->get_pred_mode(xDi-xOffs, yDi-yOffs) == MODE_INTRA); bool q_is_intra_pred = (img->get_pred_mode(xDi, yDi ) == MODE_INTRA); int bS; if (p_is_intra_pred || q_is_intra_pred) { bS = 2; } else { // opposing site int xDiOpp = xDi-xOffs; int yDiOpp = yDi-yOffs; if ((edgeFlags & transformEdgeMask) && (img->get_nonzero_coefficient(xDi ,yDi) || img->get_nonzero_coefficient(xDiOpp,yDiOpp))) { bS = 1; } else { bS = 0; const PBMotion& mviP = img->get_mv_info(xDiOpp,yDiOpp); const PBMotion& mviQ = img->get_mv_info(xDi ,yDi); slice_segment_header* shdrP = img->get_SliceHeader(xDiOpp,yDiOpp); slice_segment_header* shdrQ = img->get_SliceHeader(xDi ,yDi); if (shdrP && shdrQ) { int refPicP0 = mviP.predFlag[0] ? shdrP->RefPicList[0][ mviP.refIdx[0] ] : -1; int refPicP1 = mviP.predFlag[1] ? shdrP->RefPicList[1][ mviP.refIdx[1] ] : -1; int refPicQ0 = mviQ.predFlag[0] ? shdrQ->RefPicList[0][ mviQ.refIdx[0] ] : -1; int refPicQ1 = mviQ.predFlag[1] ? 
shdrQ->RefPicList[1][ mviQ.refIdx[1] ] : -1; bool samePics = ((refPicP0==refPicQ0 && refPicP1==refPicQ1) || (refPicP0==refPicQ1 && refPicP1==refPicQ0)); if (!samePics) { bS = 1; } else { MotionVector mvP0 = mviP.mv[0]; if (!mviP.predFlag[0]) { mvP0.x=mvP0.y=0; } MotionVector mvP1 = mviP.mv[1]; if (!mviP.predFlag[1]) { mvP1.x=mvP1.y=0; } MotionVector mvQ0 = mviQ.mv[0]; if (!mviQ.predFlag[0]) { mvQ0.x=mvQ0.y=0; } MotionVector mvQ1 = mviQ.mv[1]; if (!mviQ.predFlag[1]) { mvQ1.x=mvQ1.y=0; } int numMV_P = mviP.predFlag[0] + mviP.predFlag[1]; int numMV_Q = mviQ.predFlag[0] + mviQ.predFlag[1]; if (numMV_P!=numMV_Q) { img->decctx->add_warning(DE265_WARNING_NUMMVP_NOT_EQUAL_TO_NUMMVQ, false); img->integrity = INTEGRITY_DECODING_ERRORS; } // two different reference pictures or only one reference picture if (refPicP0 != refPicP1) { if (refPicP0 == refPicQ0) { if (abs_value(mvP0.x-mvQ0.x) >= 4 || abs_value(mvP0.y-mvQ0.y) >= 4 || abs_value(mvP1.x-mvQ1.x) >= 4 || abs_value(mvP1.y-mvQ1.y) >= 4) { bS = 1; } } else { if (abs_value(mvP0.x-mvQ1.x) >= 4 || abs_value(mvP0.y-mvQ1.y) >= 4 || abs_value(mvP1.x-mvQ0.x) >= 4 || abs_value(mvP1.y-mvQ0.y) >= 4) { bS = 1; } } } else { assert(refPicQ0==refPicQ1); if ((abs_value(mvP0.x-mvQ0.x) >= 4 || abs_value(mvP0.y-mvQ0.y) >= 4 || abs_value(mvP1.x-mvQ1.x) >= 4 || abs_value(mvP1.y-mvQ1.y) >= 4) && (abs_value(mvP0.x-mvQ1.x) >= 4 || abs_value(mvP0.y-mvQ1.y) >= 4 || abs_value(mvP1.x-mvQ0.x) >= 4 || abs_value(mvP1.y-mvQ0.y) >= 4)) { bS = 1; } } } } else { bS = 0; // if shdrP==nullptr or shdrQ==nullptr } /* printf("unimplemented deblocking code for CU at %d;%d\n",xDi,yDi); logerror(LogDeblock, "unimplemented code reached (file %s, line %d)\n", __FILE__, __LINE__); */ } } img->set_deblk_bS(xDi,yDi, bS); } else { img->set_deblk_bS(xDi,yDi, 0); } } } void derive_boundaryStrength_CTB(de265_image* img, bool vertical, int xCtb,int yCtb) { int ctbSize = img->get_sps().CtbSizeY; int deblkSize = ctbSize/4; derive_boundaryStrength(img,vertical, yCtb*deblkSize, 
(yCtb+1)*deblkSize, xCtb*deblkSize, (xCtb+1)*deblkSize); } static uint8_t table_8_23_beta[52] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,20,22,24,26,28,30,32,34,36, 38,40,42,44,46,48,50,52,54,56,58,60,62,64 }; static uint8_t table_8_23_tc[54] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 7, 8, 9,10,11,13,14,16,18,20,22,24 }; // 8.7.2.4 template void edge_filtering_luma_internal(de265_image* img, bool vertical, int yStart,int yEnd, int xStart,int xEnd) { //printf("luma %d-%d %d-%d\n",xStart,xEnd,yStart,yEnd); const seq_parameter_set& sps = img->get_sps(); int xIncr = vertical ? 2 : 1; int yIncr = vertical ? 1 : 2; const int stride = img->get_image_stride(0); int bitDepth_Y = sps.BitDepth_Y; xEnd = libde265_min(xEnd,img->get_deblk_width()); yEnd = libde265_min(yEnd,img->get_deblk_height()); for (int y=yStart;y pixel resolution int yDi = y<<2; // *4 -> pixel resolution int bS = img->get_deblk_bS(xDi,yDi); //printf("x,y:%d,%d xDi,yDi:%d,%d\n",x,y,xDi,yDi); logtrace(LogDeblock,"deblock POC=%d %c --- x:%d y:%d bS:%d---\n", img->PicOrderCntVal,vertical ? 
'V':'H',xDi,yDi,bS); #if 0 { uint8_t* ptr = img->y + stride*yDi + xDi; for (int dy=-4;dy<4;dy++) { for (int dx=-4;dx<4;dx++) { printf("%02x ", ptr[dy*stride + dx]); if (dx==-1) printf("| "); } printf("\n"); if (dy==-1) printf("-------------------------\n"); } } #endif #if 0 if (!vertical) { uint8_t* ptr = img->y + stride*yDi + xDi; for (int dy=-4;dy<4;dy++) { for (int dx=0;dx<4;dx++) { printf("%02x ", ptr[dy*stride + dx]); if (dx==-1) printf("| "); } printf("\n"); if (dy==-1) printf("-------------------------\n"); } } #endif if (bS>0) { // 8.7.2.4.3 pixel_t* ptr = img->get_image_plane_at_pos_NEW(0, xDi,yDi); pixel_t q[4][4], p[4][4]; for (int k=0;k<4;k++) for (int i=0;i<4;i++) { if (vertical) { q[k][i] = ptr[ i +k*stride]; p[k][i] = ptr[-i-1+k*stride]; } else { q[k][i] = ptr[k + i *stride]; p[k][i] = ptr[k -(i+1)*stride]; } } #if 0 for (int k=0;k<4;k++) { for (int i=0;i<4;i++) { printf("%02x ", p[k][3-i]); } printf("| "); for (int i=0;i<4;i++) { printf("%02x ", q[k][i]); } printf("\n"); } #endif int QP_Q = img->get_QPY(xDi,yDi); int QP_P = (vertical ? 
img->get_QPY(xDi-1,yDi) : img->get_QPY(xDi,yDi-1) ); int qP_L = (QP_Q+QP_P+1)>>1; logtrace(LogDeblock,"QP: %d & %d -> %d\n",QP_Q,QP_P,qP_L); int sliceIndexQ00 = img->get_SliceHeaderIndex(xDi,yDi); int beta_offset = img->slices[sliceIndexQ00]->slice_beta_offset; int tc_offset = img->slices[sliceIndexQ00]->slice_tc_offset; int Q_beta = Clip3(0,51, qP_L + beta_offset); int betaPrime = table_8_23_beta[Q_beta]; int beta = betaPrime * (1<<(bitDepth_Y - 8)); int Q_tc = Clip3(0,53, qP_L + 2*(bS-1) + tc_offset); int tcPrime = table_8_23_tc[Q_tc]; int tc = tcPrime * (1<<(bitDepth_Y - 8)); logtrace(LogDeblock,"beta: %d (%d) tc: %d (%d)\n",beta,beta_offset, tc,tc_offset); int dE=0, dEp=0, dEq=0; int dp0 = abs_value(p[0][2] - 2*p[0][1] + p[0][0]); int dp3 = abs_value(p[3][2] - 2*p[3][1] + p[3][0]); int dq0 = abs_value(q[0][2] - 2*q[0][1] + q[0][0]); int dq3 = abs_value(q[3][2] - 2*q[3][1] + q[3][0]); int dpq0 = dp0 + dq0; int dpq3 = dp3 + dq3; int dp = dp0 + dp3; int dq = dq0 + dq3; int d = dpq0 + dpq3; if (d < beta) { //int dpq = 2*dpq0; bool dSam0 = (2 * dpq0 < (beta >> 2) && abs_value(p[0][3]-p[0][0]) + abs_value(q[0][0]-q[0][3]) < (beta >> 3) && abs_value(p[0][0]-q[0][0]) < ((5 * tc + 1) >> 1)); bool dSam3 = (2 * dpq3 < (beta >> 2) && abs_value(p[3][3]-p[3][0]) + abs_value(q[3][0]-q[3][3]) < (beta >> 3) && abs_value(p[3][0]-q[3][0]) < ((5 * tc + 1) >> 1)); if (dSam0 && dSam3) { dE = 2; } else { dE = 1; } if (dp < ((beta + (beta >> 1)) >> 3)) { dEp = 1; } if (dq < ((beta + (beta >> 1)) >> 3)) { dEq = 1; } logtrace(LogDeblock, "dE:%d dEp:%d dEq:%d\n", dE, dEp, dEq); } // 8.7.2.4.4 if (dE != 0) { bool filterP = true; bool filterQ = true; if (vertical) { if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(xDi-1,yDi)) filterP=false; if (img->get_cu_transquant_bypass(xDi-1,yDi)) filterP=false; if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(xDi,yDi)) filterQ=false; if (img->get_cu_transquant_bypass(xDi,yDi)) filterQ=false; } else { if 
(sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(xDi,yDi-1)) filterP=false; if (img->get_cu_transquant_bypass(xDi,yDi-1)) filterP=false; if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(xDi,yDi)) filterQ=false; if (img->get_cu_transquant_bypass(xDi,yDi)) filterQ=false; } for (int k=0;k<4;k++) { //int nDp,nDq; logtrace(LogDeblock,"line:%d\n",k); const pixel_t p0 = p[k][0]; const pixel_t p1 = p[k][1]; const pixel_t p2 = p[k][2]; const pixel_t p3 = p[k][3]; const pixel_t q0 = q[k][0]; const pixel_t q1 = q[k][1]; const pixel_t q2 = q[k][2]; const pixel_t q3 = q[k][3]; if (dE==2) { // strong filtering //nDp=nDq=3; pixel_t pnew[3],qnew[3]; pnew[0] = Clip3(p0-2*tc,p0+2*tc, (p2 + 2*p1 + 2*p0 + 2*q0 + q1 +4)>>3); pnew[1] = Clip3(p1-2*tc,p1+2*tc, (p2 + p1 + p0 + q0+2)>>2); pnew[2] = Clip3(p2-2*tc,p2+2*tc, (2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3); qnew[0] = Clip3(q0-2*tc,q0+2*tc, (p1+2*p0+2*q0+2*q1+q2+4)>>3); qnew[1] = Clip3(q1-2*tc,q1+2*tc, (p0+q0+q1+q2+2)>>2); qnew[2] = Clip3(q2-2*tc,q2+2*tc, (p0+q0+q1+3*q2+2*q3+4)>>3); logtrace(LogDeblock,"strong filtering\n"); if (vertical) { for (int i=0;i<3;i++) { if (filterP) { ptr[-i-1+k*stride] = pnew[i]; } if (filterQ) { ptr[ i + k*stride] = qnew[i]; } } // ptr[-1+k*stride] = ptr[ 0+k*stride] = 200; } else { for (int i=0;i<3;i++) { if (filterP) { ptr[ k -(i+1)*stride] = pnew[i]; } if (filterQ) { ptr[ k + i *stride] = qnew[i]; } } } } else { // weak filtering //nDp=nDq=0; int delta = (9*(q0-p0) - 3*(q1-p1) + 8)>>4; logtrace(LogDeblock,"delta=%d, tc=%d\n",delta,tc); if (abs_value(delta) < tc*10) { delta = Clip3(-tc,tc,delta); logtrace(LogDeblock," deblk + %d;%d [%02x->%02x] - %d;%d [%02x->%02x] delta:%d\n", vertical ? xDi-1 : xDi+k, vertical ? yDi+k : yDi-1, p0,Clip_BitDepth(p0+delta, bitDepth_Y), vertical ? xDi : xDi+k, vertical ? 
yDi+k : yDi, q0,Clip_BitDepth(q0-delta, bitDepth_Y), delta); if (vertical) { if (filterP) { ptr[-0-1+k*stride] = Clip_BitDepth(p0+delta, bitDepth_Y); } if (filterQ) { ptr[ 0 +k*stride] = Clip_BitDepth(q0-delta, bitDepth_Y); } } else { if (filterP) { ptr[ k -1*stride] = Clip_BitDepth(p0+delta, bitDepth_Y); } if (filterQ) { ptr[ k +0*stride] = Clip_BitDepth(q0-delta, bitDepth_Y); } } //ptr[ 0+k*stride] = 200; if (dEp==1 && filterP) { int delta_p = Clip3(-(tc>>1), tc>>1, (((p2+p0+1)>>1)-p1+delta)>>1); logtrace(LogDeblock," deblk dEp %d;%d delta:%d\n", vertical ? xDi-2 : xDi+k, vertical ? yDi+k : yDi-2, delta_p); if (vertical) { ptr[-1-1+k*stride] = Clip_BitDepth(p1+delta_p, bitDepth_Y); } else { ptr[ k -2*stride] = Clip_BitDepth(p1+delta_p, bitDepth_Y); } } if (dEq==1 && filterQ) { int delta_q = Clip3(-(tc>>1), tc>>1, (((q2+q0+1)>>1)-q1-delta)>>1); logtrace(LogDeblock," delkb dEq %d;%d delta:%d\n", vertical ? xDi+1 : xDi+k, vertical ? yDi+k : yDi+1, delta_q); if (vertical) { ptr[ 1 +k*stride] = Clip_BitDepth(q1+delta_q, bitDepth_Y); } else { ptr[ k +1*stride] = Clip_BitDepth(q1+delta_q, bitDepth_Y); } } //nDp = dEp+1; //nDq = dEq+1; //logtrace(LogDeblock,"weak filtering (%d:%d)\n",nDp,nDq); } } } } } } } void edge_filtering_luma(de265_image* img, bool vertical, int yStart,int yEnd, int xStart,int xEnd) { if (img->high_bit_depth(0)) { edge_filtering_luma_internal(img,vertical,yStart,yEnd,xStart,xEnd); } else { edge_filtering_luma_internal(img,vertical,yStart,yEnd,xStart,xEnd); } } void edge_filtering_luma_CTB(de265_image* img, bool vertical, int xCtb,int yCtb) { int ctbSize = img->get_sps().CtbSizeY; int deblkSize = ctbSize/4; edge_filtering_luma(img,vertical, yCtb*deblkSize, (yCtb+1)*deblkSize, xCtb*deblkSize, (xCtb+1)*deblkSize); } // 8.7.2.4 /** ?Start and ?End values in 4-luma pixels resolution. 
*/ template void edge_filtering_chroma_internal(de265_image* img, bool vertical, int yStart,int yEnd, int xStart,int xEnd) { //printf("chroma %d-%d %d-%d\n",xStart,xEnd,yStart,yEnd); const seq_parameter_set& sps = img->get_sps(); const int SubWidthC = sps.SubWidthC; const int SubHeightC = sps.SubHeightC; int xIncr = vertical ? 2 : 1; int yIncr = vertical ? 1 : 2; xIncr *= SubWidthC; yIncr *= SubHeightC; const int stride = img->get_image_stride(1); xEnd = libde265_min(xEnd,img->get_deblk_width()); yEnd = libde265_min(yEnd,img->get_deblk_height()); int bitDepth_C = sps.BitDepth_C; for (int y=yStart;yget_deblk_bS(xDi*SubWidthC,yDi*SubHeightC); if (bS>1) { // 8.7.2.4.5 for (int cplane=0;cplane<2;cplane++) { int cQpPicOffset = (cplane==0 ? img->get_pps().pic_cb_qp_offset : img->get_pps().pic_cr_qp_offset); pixel_t* ptr = img->get_image_plane_at_pos_NEW(cplane+1, xDi,yDi); pixel_t p[2][4]; pixel_t q[2][4]; logtrace(LogDeblock,"-%s- %d %d\n",cplane==0 ? "Cb" : "Cr",xDi,yDi); for (int i=0;i<2;i++) for (int k=0;k<4;k++) { if (vertical) { q[i][k] = ptr[ i +k*stride]; p[i][k] = ptr[-i-1+k*stride]; } else { q[i][k] = ptr[k + i *stride]; p[i][k] = ptr[k -(i+1)*stride]; } } #if 0 for (int k=0;k<4;k++) { for (int i=0;i<2;i++) { printf("%02x ", p[1-i][k]); } printf("| "); for (int i=0;i<2;i++) { printf("%02x ", q[i][k]); } printf("\n"); } #endif int QP_Q = img->get_QPY(SubWidthC*xDi,SubHeightC*yDi); int QP_P = (vertical ? 
img->get_QPY(SubWidthC*xDi-1,SubHeightC*yDi) : img->get_QPY(SubWidthC*xDi,SubHeightC*yDi-1)); int qP_i = ((QP_Q+QP_P+1)>>1) + cQpPicOffset; int QP_C; if (sps.ChromaArrayType == CHROMA_420) { QP_C = table8_22(qP_i); } else { QP_C = libde265_min(qP_i, 51); } //printf("POC=%d\n",ctx->img->PicOrderCntVal); logtrace(LogDeblock,"%d %d: ((%d+%d+1)>>1) + %d = qP_i=%d (QP_C=%d)\n", SubWidthC*xDi,SubHeightC*yDi, QP_Q,QP_P,cQpPicOffset,qP_i,QP_C); int sliceIndexQ00 = img->get_SliceHeaderIndex(SubWidthC*xDi,SubHeightC*yDi); int tc_offset = img->slices[sliceIndexQ00]->slice_tc_offset; int Q = Clip3(0,53, QP_C + 2*(bS-1) + tc_offset); int tcPrime = table_8_23_tc[Q]; int tc = tcPrime * (1<<(sps.BitDepth_C - 8)); logtrace(LogDeblock,"tc_offset=%d Q=%d tc'=%d tc=%d\n",tc_offset,Q,tcPrime,tc); if (vertical) { bool filterP = true; if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(SubWidthC*xDi-1,SubHeightC*yDi)) filterP=false; if (img->get_cu_transquant_bypass(SubWidthC*xDi-1,SubHeightC*yDi)) filterP=false; bool filterQ = true; if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(SubWidthC*xDi,SubHeightC*yDi)) filterQ=false; if (img->get_cu_transquant_bypass(SubWidthC*xDi,SubHeightC*yDi)) filterQ=false; for (int k=0;k<4;k++) { int delta = Clip3(-tc,tc, ((((q[0][k]-p[0][k])*4)+p[1][k]-q[1][k]+4)>>3)); // standard says <<2 in eq. 
(8-356), but the value can also be negative logtrace(LogDeblock,"delta=%d\n",delta); if (filterP) { ptr[-1+k*stride] = Clip_BitDepth(p[0][k]+delta, bitDepth_C); } if (filterQ) { ptr[ 0+k*stride] = Clip_BitDepth(q[0][k]-delta, bitDepth_C); } } } else { bool filterP = true; if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(SubWidthC*xDi,SubHeightC*yDi-1)) filterP=false; if (img->get_cu_transquant_bypass(SubWidthC*xDi,SubHeightC*yDi-1)) filterP=false; bool filterQ = true; if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(SubWidthC*xDi,SubHeightC*yDi)) filterQ=false; if (img->get_cu_transquant_bypass(SubWidthC*xDi,SubHeightC*yDi)) filterQ=false; for (int k=0;k<4;k++) { int delta = Clip3(-tc,tc, ((((q[0][k]-p[0][k])*4)+p[1][k]-q[1][k]+4)>>3)); // standard says <<2, but the value can also be negative if (filterP) { ptr[ k-1*stride] = Clip_BitDepth(p[0][k]+delta, bitDepth_C); } if (filterQ) { ptr[ k+0*stride] = Clip_BitDepth(q[0][k]-delta, bitDepth_C); } } } } } } } void edge_filtering_chroma(de265_image* img, bool vertical, int yStart,int yEnd, int xStart,int xEnd) { if (img->high_bit_depth(1)) { edge_filtering_chroma_internal(img,vertical,yStart,yEnd,xStart,xEnd); } else { edge_filtering_chroma_internal(img,vertical,yStart,yEnd,xStart,xEnd); } } void edge_filtering_chroma_CTB(de265_image* img, bool vertical, int xCtb,int yCtb) { int ctbSize = img->get_sps().CtbSizeY; int deblkSize = ctbSize/4; edge_filtering_chroma(img,vertical, yCtb*deblkSize, (yCtb+1)*deblkSize, xCtb*deblkSize, (xCtb+1)*deblkSize); } class thread_task_deblock_CTBRow : public thread_task { public: struct de265_image* img; int ctb_y; bool vertical; virtual void work(); virtual std::string name() const { char buf[100]; sprintf(buf,"deblock-%d",ctb_y); return buf; } }; void thread_task_deblock_CTBRow::work() { state = Running; img->thread_run(this); int xStart=0; int xEnd = img->get_deblk_width(); int ctbSize = img->get_sps().CtbSizeY; int deblkSize = ctbSize/4; int first = ctb_y * 
deblkSize; int last = (ctb_y+1) * deblkSize; if (last > img->get_deblk_height()) { last = img->get_deblk_height(); } int finalProgress = CTB_PROGRESS_DEBLK_V; if (!vertical) finalProgress = CTB_PROGRESS_DEBLK_H; int rightCtb = img->get_sps().PicWidthInCtbsY-1; if (vertical) { // pass 1: vertical int CtbRow = std::min(ctb_y+1 , img->get_sps().PicHeightInCtbsY-1); img->wait_for_progress(this, rightCtb,CtbRow, CTB_PROGRESS_PREFILTER); } else { // pass 2: horizontal if (ctb_y>0) { img->wait_for_progress(this, rightCtb,ctb_y-1, CTB_PROGRESS_DEBLK_V); } img->wait_for_progress(this, rightCtb,ctb_y, CTB_PROGRESS_DEBLK_V); if (ctb_y+1get_sps().PicHeightInCtbsY) { img->wait_for_progress(this, rightCtb,ctb_y+1, CTB_PROGRESS_DEBLK_V); } } //printf("deblock %d to %d orientation: %d\n",first,last,vertical); bool deblocking_enabled; // first pass: check edge flags and whether we have to deblock if (vertical) { deblocking_enabled = derive_edgeFlags_CTBRow(img, ctb_y); //for (int x=0;x<=rightCtb;x++) { int x=0; img->set_CtbDeblockFlag(x,ctb_y, deblocking_enabled); //} } else { int x=0; deblocking_enabled=img->get_CtbDeblockFlag(x,ctb_y); } if (deblocking_enabled) { derive_boundaryStrength(img, vertical, first,last, xStart,xEnd); edge_filtering_luma(img, vertical, first,last, xStart,xEnd); if (img->get_sps().ChromaArrayType != CHROMA_MONO) { edge_filtering_chroma(img, vertical, first,last, xStart,xEnd); } } for (int x=0;x<=rightCtb;x++) { const int CtbWidth = img->get_sps().PicWidthInCtbsY; img->ctb_progress[x+ctb_y*CtbWidth].set_progress(finalProgress); } state = Finished; img->thread_finishes(this); } void add_deblocking_tasks(image_unit* imgunit) { de265_image* img = imgunit->img; decoder_context* ctx = img->decctx; int nRows = img->get_sps().PicHeightInCtbsY; img->thread_start(nRows*2); for (int pass=0;pass<2;pass++) { for (int y=0;yget_sps().PicHeightInCtbsY;y++) { thread_task_deblock_CTBRow* task = new thread_task_deblock_CTBRow; task->img = img; task->ctb_y = y; 
task->vertical = (pass==0); imgunit->tasks.push_back(task); ctx->thread_pool_.add_task(task); } } } void apply_deblocking_filter(de265_image* img) // decoder_context* ctx) { //decoder_context* ctx = img->decctx; char enabled_deblocking = derive_edgeFlags(img); if (enabled_deblocking) { // vertical filtering logtrace(LogDeblock,"VERTICAL\n"); derive_boundaryStrength(img, true ,0,img->get_deblk_height(),0,img->get_deblk_width()); edge_filtering_luma (img, true ,0,img->get_deblk_height(),0,img->get_deblk_width()); if (img->get_sps().ChromaArrayType != CHROMA_MONO) { edge_filtering_chroma (img, true ,0,img->get_deblk_height(),0,img->get_deblk_width()); } #if 0 char buf[1000]; sprintf(buf,"lf-after-V-%05d.yuv", ctx->img->PicOrderCntVal); write_picture_to_file(ctx->img, buf); #endif // horizontal filtering logtrace(LogDeblock,"HORIZONTAL\n"); derive_boundaryStrength(img, false ,0,img->get_deblk_height(),0,img->get_deblk_width()); edge_filtering_luma (img, false ,0,img->get_deblk_height(),0,img->get_deblk_width()); if (img->get_sps().ChromaArrayType != CHROMA_MONO) { edge_filtering_chroma (img, false ,0,img->get_deblk_height(),0,img->get_deblk_width()); } #if 0 sprintf(buf,"lf-after-H-%05d.yuv", ctx->img->PicOrderCntVal); write_picture_to_file(ctx->img, buf); #endif } } libde265-1.0.18/libde265/deblock.h000066400000000000000000000017541515675107500162300ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef DE265_DEBLOCK_H #define DE265_DEBLOCK_H #include "libde265/decctx.h" void add_deblocking_tasks(image_unit* imgunit); void apply_deblocking_filter(de265_image* img); //decoder_context* ctx); #endif libde265-1.0.18/libde265/decctx.cc000066400000000000000000001576211515675107500162420ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #include "decctx.h" #include "util.h" #include "sao.h" #include "sei.h" #include "deblock.h" #include #include #include #include #include #include #include "fallback.h" #ifdef HAVE_CONFIG_H #include "config.h" #endif #ifdef HAVE_SSE4_1 #include "x86/sse.h" #endif #ifdef HAVE_ARM #include "arm/arm.h" #endif #define SAVE_INTERMEDIATE_IMAGES 0 #if SAVE_INTERMEDIATE_IMAGES #include "visualize.h" #endif extern void thread_decode_CTB_row(void* d); extern void thread_decode_slice_segment(void* d); thread_context::thread_context() { // There is an interesting issue here. When aligning _coeffBuf to 16 bytes offset with // __attribute__((align(16))), the following statement is optimized away since the // compiler assumes that the pointer would be 16-byte aligned. 
However, this is not the // case when the structure has been dynamically allocated. In this case, the base can // also be at 8 byte offsets (at least with MingW,32 bit). int offset = ((uintptr_t)_coeffBuf) & 0xf; if (offset == 0) { coeffBuf = _coeffBuf; } else { coeffBuf = (int16_t *) (((uint8_t *)_coeffBuf) + (16-offset)); } memset(coeffBuf, 0, 32*32*sizeof(int16_t)); } slice_unit::slice_unit(decoder_context* decctx) : nal(nullptr), shdr(nullptr), imgunit(nullptr), flush_reorder_buffer(false), nThreads(0), first_decoded_CTB_RS(-1), last_decoded_CTB_RS(-1), thread_contexts(nullptr), ctx(decctx) { state = Unprocessed; nThreadContexts = 0; } slice_unit::~slice_unit() { ctx->nal_parser.free_NAL_unit(nal); if (thread_contexts) { delete[] thread_contexts; } } void slice_unit::allocate_thread_contexts(int n) { assert(thread_contexts==nullptr); thread_contexts = new thread_context[n]; nThreadContexts = n; } image_unit::image_unit() = default; image_unit::~image_unit() { for (size_t i=0;i0) { //flush_thread_pool(&ctx->thread_pool); thread_pool_.stop(); } } void decoder_context::reset() { if (num_worker_threads>0) { //flush_thread_pool(&ctx->thread_pool); thread_pool_.stop(); } // -------------------------------------------------- NumPocStCurrBefore = 0; NumPocStCurrAfter = 0; NumPocStFoll = 0; NumPocLtCurr = 0; NumPocLtFoll = 0; nal_unit_type = 0; IdrPicFlag = 0; RapPicFlag = 0; img = nullptr; // TODO: remove all pending image_units // --- decoded picture buffer --- current_image_poc_lsb = -1; // any invalid number first_decoded_picture = true; // --- remove all pictures from output queue --- // there was a bug the peek_next_image did not return nullptr on empty output queues. // This was (indirectly) fixed by recreating the DPB buffer, but it should actually // be sufficient to clear it like this. // The error showed while scrubbing the ToS video in VLC. 
dpb.clear(); nal_parser.remove_pending_input_data(); while (!image_units.empty()) { delete image_units.back(); image_units.pop_back(); } // --- start threads again --- if (num_worker_threads>0) { // TODO: need error checking start_thread_pool(num_worker_threads); } } void base_context::set_acceleration_functions(enum de265_acceleration l) { // fill scalar functions first (so that function table is completely filled) init_acceleration_functions_fallback(&acceleration); // override functions with optimized variants #ifdef HAVE_SSE4_1 if (l>=de265_acceleration_SSE) { init_acceleration_functions_sse(&acceleration); } #endif #ifdef HAVE_ARM if (l>=de265_acceleration_ARM) { init_acceleration_functions_arm(&acceleration); } #endif } void decoder_context::init_thread_context(thread_context* tctx) { // zero scrap memory for coefficient blocks memset(tctx->_coeffBuf, 0, sizeof(tctx->_coeffBuf)); // TODO: check if we can safely remove this tctx->currentQG_x = -1; tctx->currentQG_y = -1; // --- find QPY that was active at the end of the previous slice --- // find the previous CTB in TS order const pic_parameter_set& pps = tctx->img->get_pps(); const seq_parameter_set& sps = tctx->img->get_sps(); if (tctx->shdr->slice_segment_address > 0) { int prevCtb = pps.CtbAddrTStoRS[ pps.CtbAddrRStoTS[tctx->shdr->slice_segment_address] -1 ]; int ctbX = prevCtb % sps.PicWidthInCtbsY; int ctbY = prevCtb / sps.PicWidthInCtbsY; // take the pixel at the bottom right corner (but consider that the image size might be smaller) int x = ((ctbX+1) << sps.Log2CtbSizeY)-1; int y = ((ctbY+1) << sps.Log2CtbSizeY)-1; x = std::min(x,sps.pic_width_in_luma_samples-1); y = std::min(y,sps.pic_height_in_luma_samples-1); //printf("READ QPY: %d %d -> %d (should %d)\n",x,y,imgunit->img->get_QPY(x,y), tc.currentQPY); //if (tctx->shdr->dependent_slice_segment_flag) { // TODO: do we need this condition ? 
tctx->currentQPY = tctx->img->get_QPY(x,y); //} } } void decoder_context::add_task_decode_CTB_row(thread_context* tctx, bool firstSliceSubstream, int ctbRow) { thread_task_ctb_row* task = new thread_task_ctb_row; task->firstSliceSubstream = firstSliceSubstream; task->tctx = tctx; task->debug_startCtbRow = ctbRow; tctx->task = task; thread_pool_.add_task(task); tctx->imgunit->tasks.push_back(task); } void decoder_context::add_task_decode_slice_segment(thread_context* tctx, bool firstSliceSubstream, int ctbx,int ctby) { thread_task_slice_segment* task = new thread_task_slice_segment; task->firstSliceSubstream = firstSliceSubstream; task->tctx = tctx; task->debug_startCtbX = ctbx; task->debug_startCtbY = ctby; tctx->task = task; thread_pool_.add_task(task); tctx->imgunit->tasks.push_back(task); } de265_error decoder_context::read_vps_NAL(bitreader& reader) { logdebug(LogHeaders,"---> read VPS\n"); std::shared_ptr new_vps = std::make_shared(); de265_error err = new_vps->read(this,&reader); if (err != DE265_OK) { return err; } if (param_vps_headers_fd>=0) { new_vps->dump(param_vps_headers_fd); } vps[ new_vps->video_parameter_set_id ] = new_vps; return DE265_OK; } de265_error decoder_context::read_sps_NAL(bitreader& reader) { logdebug(LogHeaders,"----> read SPS\n"); std::shared_ptr new_sps = std::make_shared(); de265_error err; if ((err=new_sps->read(this, &reader)) != DE265_OK) { return err; } if (param_sps_headers_fd>=0) { new_sps->dump(param_sps_headers_fd); } sps[ new_sps->seq_parameter_set_id ] = new_sps; // Remove the all PPS that referenced the old SPS because parameters may have changed and we do not want to // get the SPS and PPS parameters (e.g. image size) out of sync. 
for (auto& p : pps) { if (p && p->seq_parameter_set_id == new_sps->seq_parameter_set_id) { p = nullptr; } } return DE265_OK; } de265_error decoder_context::read_pps_NAL(bitreader& reader) { logdebug(LogHeaders,"----> read PPS\n"); std::shared_ptr new_pps = std::make_shared(); bool success = new_pps->read(&reader,this); if (!success) { return DE265_WARNING_PPS_HEADER_INVALID; } if (param_pps_headers_fd>=0) { new_pps->dump(param_pps_headers_fd); } pps[ (int)new_pps->pic_parameter_set_id ] = new_pps; return DE265_OK; } de265_error decoder_context::read_sei_NAL(bitreader& reader, bool suffix) { logdebug(LogHeaders,"----> read SEI\n"); sei_message sei; //push_current_picture_to_output_queue(); de265_error err = DE265_OK; if ((err=read_sei(&reader,&sei, suffix, current_sps.get())) == DE265_OK) { dump_sei(&sei, current_sps.get()); if (image_units.empty()==false && suffix) { image_units.back()->suffix_SEIs.push_back(sei); } } else { add_warning(err, false); } return err; } de265_error decoder_context::read_eos_NAL(bitreader& reader) { FirstAfterEndOfSequenceNAL = true; return DE265_OK; } de265_error decoder_context::read_slice_NAL(bitreader& reader, NAL_unit* nal, nal_header& nal_hdr) { logdebug(LogHeaders,"---> read slice segment header\n"); // --- read slice header --- slice_segment_header* shdr = new slice_segment_header; bool continueDecoding; de265_error err = shdr->read(&reader,this, &continueDecoding); if (!continueDecoding) { if (img) { img->integrity = INTEGRITY_NOT_DECODED; } nal_parser.free_NAL_unit(nal); delete shdr; return err; } if (param_slice_headers_fd>=0) { shdr->dump_slice_segment_header(this, param_slice_headers_fd); } if (process_slice_segment_header(shdr, &err, nal->pts, &nal_hdr, nal->user_data) == false) { if (img!=nullptr) img->integrity = INTEGRITY_NOT_DECODED; nal_parser.free_NAL_unit(nal); delete shdr; return err; } reader.skip_bits(1); // TODO: why? 
reader.prepare_for_CABAC(); // modify entry_point_offsets uint32_t headerLength = reader.data - nal->data(); for (int i=0;inum_entry_point_offsets;i++) { uint32_t skipped = nal->num_skipped_bytes_before(shdr->entry_point_offset[i], headerLength); if (skipped > shdr->entry_point_offset[i]) { add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); nal_parser.free_NAL_unit(nal); delete shdr; return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } shdr->entry_point_offset[i] -= skipped; } this->img->add_slice_segment_header(shdr); // --- start a new image if this is the first slice --- if (shdr->first_slice_segment_in_pic_flag) { image_unit* imgunit = new image_unit; imgunit->img = this->img; image_units.push_back(imgunit); } // --- add slice to current picture --- if ( ! image_units.empty() ) { slice_unit* sliceunit = new slice_unit(this); sliceunit->nal = nal; sliceunit->shdr = shdr; sliceunit->reader = reader; sliceunit->flush_reorder_buffer = flush_reorder_buffer_at_this_frame; image_units.back()->slice_units.push_back(sliceunit); } else { nal_parser.free_NAL_unit(nal); } bool did_work; err = decode_some(&did_work); return DE265_OK; } template void pop_front(std::vector& vec) { for (size_t i=1;islice_units.empty() ) { image_unit* imgunit = image_units[0]; slice_unit* sliceunit = imgunit->get_next_unprocessed_slice_segment(); if (sliceunit != nullptr) { //pop_front(imgunit->slice_units); if (sliceunit->flush_reorder_buffer) { dpb.flush_reorder_buffer(); } *did_work = true; //err = decode_slice_unit_sequential(imgunit, sliceunit); err = decode_slice_unit_parallel(imgunit, sliceunit); if (err) { return err; } //delete sliceunit; } } // if we decoded all slices of the current image and there will not // be added any more slices to the image, output the image if ( ( image_units.size()>=2 && image_units[0]->all_slice_segments_processed()) || ( image_units.size()>=1 && image_units[0]->all_slice_segments_processed() && nal_parser.number_of_NAL_units_pending()==0 && 
(nal_parser.is_end_of_stream() || nal_parser.is_end_of_frame()) )) { image_unit* imgunit = image_units[0]; *did_work=true; // mark all CTBs as decoded even if they are not, because faulty input // streams could miss part of the picture // TODO: this will not work when slice decoding is parallel to post-filtering, // so we will have to replace this with keeping track of which CTB should have // been decoded (but aren't because of the input stream being faulty) imgunit->img->mark_all_CTB_progress(CTB_PROGRESS_PREFILTER); // run post-processing filters (deblocking & SAO) if (img->decctx->num_worker_threads) run_postprocessing_filters_parallel(imgunit); else run_postprocessing_filters_sequential(imgunit->img); // process suffix SEIs for (size_t i=0;isuffix_SEIs.size();i++) { const sei_message& sei = imgunit->suffix_SEIs[i]; err = process_sei(&sei, imgunit->img); if (err != DE265_OK) break; } push_picture_to_output_queue(imgunit); // remove just decoded image unit from queue delete imgunit; pop_front(image_units); } return err; } de265_error decoder_context::decode_slice_unit_sequential(image_unit* imgunit, slice_unit* sliceunit) { de265_error err = DE265_OK; /* printf("decode slice POC=%d addr=%d, img=%p\n", sliceunit->shdr->slice_pic_order_cnt_lsb, sliceunit->shdr->slice_segment_address, imgunit->img); */ remove_images_from_dpb(sliceunit->shdr->RemoveReferencesList); if (sliceunit->shdr->slice_segment_address >= imgunit->img->get_pps().CtbAddrRStoTS.size()) { return DE265_ERROR_CTB_OUTSIDE_IMAGE_AREA; } thread_context tctx; tctx.shdr = sliceunit->shdr; tctx.img = imgunit->img; tctx.decctx = this; tctx.imgunit = imgunit; tctx.sliceunit= sliceunit; tctx.CtbAddrInTS = imgunit->img->get_pps().CtbAddrRStoTS[tctx.shdr->slice_segment_address]; tctx.task = nullptr; init_thread_context(&tctx); if (sliceunit->reader.bytes_remaining <= 0) { return DE265_ERROR_PREMATURE_END_OF_SLICE; } tctx.cabac_decoder.init(sliceunit->reader.data, sliceunit->reader.bytes_remaining); // alloc 
CABAC-model array if entropy_coding_sync is enabled if (imgunit->img->get_pps().entropy_coding_sync_enabled_flag && sliceunit->shdr->first_slice_segment_in_pic_flag) { imgunit->ctx_models.resize( (img->get_sps().PicHeightInCtbsY-1) ); //* CONTEXT_MODEL_TABLE_LENGTH ); } sliceunit->nThreads=1; err=read_slice_segment_data(&tctx); sliceunit->finished_threads.set_progress(1); return err; } void decoder_context::mark_whole_slice_as_processed(image_unit* imgunit, slice_unit* sliceunit, int progress) { //printf("mark whole slice\n"); // mark all CTBs upto the next slice segment as processed slice_unit* nextSegment = imgunit->get_next_slice_segment(sliceunit); if (nextSegment) { /* printf("mark whole slice between %d and %d\n", sliceunit->shdr->slice_segment_address, nextSegment->shdr->slice_segment_address); */ for (uint32_t ctb=sliceunit->shdr->slice_segment_address; ctb < nextSegment->shdr->slice_segment_address; ctb++) { if (ctb >= imgunit->img->number_of_ctbs()) break; imgunit->img->ctb_progress[ctb].set_progress(progress); } } } de265_error decoder_context::decode_slice_unit_parallel(image_unit* imgunit, slice_unit* sliceunit) { de265_error err = DE265_OK; remove_images_from_dpb(sliceunit->shdr->RemoveReferencesList); /* printf("-------- decode --------\n"); printf("IMAGE UNIT %p\n",imgunit); sliceunit->shdr->dump_slice_segment_header(sliceunit->ctx, 1); imgunit->dump_slices(); */ de265_image* img = imgunit->img; const pic_parameter_set& pps = img->get_pps(); sliceunit->state = slice_unit::InProgress; bool use_WPP = (img->decctx->num_worker_threads > 0 && pps.entropy_coding_sync_enabled_flag); bool use_tiles = (img->decctx->num_worker_threads > 0 && pps.tiles_enabled_flag); // TODO: remove this warning later when we do frame-parallel decoding if (img->decctx->num_worker_threads > 0 && pps.entropy_coding_sync_enabled_flag == false && pps.tiles_enabled_flag == false) { img->decctx->add_warning(DE265_WARNING_NO_WPP_CANNOT_USE_MULTITHREADING, true); } // If this is the 
first slice segment, mark all CTBs before this as processed // (the real first slice segment could be missing). if (imgunit->is_first_slice_segment(sliceunit)) { slice_segment_header* shdr = sliceunit->shdr; int firstCTB = shdr->slice_segment_address; for (int ctb=0;ctbctb_progress[ctb].set_progress(CTB_PROGRESS_PREFILTER); } } // if there is a previous slice that has been completely decoded, // mark all CTBs until the start of this slice as completed //printf("this slice: %p\n",sliceunit); slice_unit* prevSlice = imgunit->get_prev_slice_segment(sliceunit); //if (prevSlice) printf("prev slice state: %d\n",prevSlice->state); if (prevSlice && prevSlice->state == slice_unit::Decoded) { mark_whole_slice_as_processed(imgunit,prevSlice,CTB_PROGRESS_PREFILTER); } // TODO: even though we cannot split this into several tasks, we should run it // as a background thread if (!use_WPP && !use_tiles) { //printf("SEQ\n"); err = decode_slice_unit_sequential(imgunit, sliceunit); sliceunit->state = slice_unit::Decoded; mark_whole_slice_as_processed(imgunit,sliceunit,CTB_PROGRESS_PREFILTER); return err; } if (use_WPP && use_tiles) { // TODO: this is not allowed ... 
output some warning or error return DE265_WARNING_PPS_HEADER_INVALID; } if (use_WPP) { //printf("WPP\n"); err = decode_slice_unit_WPP(imgunit, sliceunit); sliceunit->state = slice_unit::Decoded; mark_whole_slice_as_processed(imgunit,sliceunit,CTB_PROGRESS_PREFILTER); return err; } else if (use_tiles) { //printf("TILE\n"); err = decode_slice_unit_tiles(imgunit, sliceunit); sliceunit->state = slice_unit::Decoded; mark_whole_slice_as_processed(imgunit,sliceunit,CTB_PROGRESS_PREFILTER); return err; } assert(false); return err; } de265_error decoder_context::decode_slice_unit_WPP(image_unit* imgunit, slice_unit* sliceunit) { de265_error err = DE265_OK; de265_image* img = imgunit->img; slice_segment_header* shdr = sliceunit->shdr; const pic_parameter_set& pps = img->get_pps(); int nRows = shdr->num_entry_point_offsets +1; int ctbsWidth = img->get_sps().PicWidthInCtbsY; assert(img->num_threads_active() == 0); // reserve space to store entropy coding context models for each CTB row if (shdr->first_slice_segment_in_pic_flag) { // reserve space for nRows-1 because we don't need to save the CABAC model in the last CTB row imgunit->ctx_models.resize( (img->get_sps().PicHeightInCtbsY-1) ); //* CONTEXT_MODEL_TABLE_LENGTH ); } sliceunit->allocate_thread_contexts(nRows); // first CTB in this slice int ctbAddrRS = shdr->slice_segment_address; int ctbRow = ctbAddrRS / ctbsWidth; for (int entryPt=0;entryPt0) { ctbRow++; ctbAddrRS = ctbRow * ctbsWidth; } else if (nRows>1 && (ctbAddrRS % ctbsWidth) != 0) { // If slice segment consists of several WPP rows, each of them // has to start at a row. 
//printf("does not start at start\n"); err = DE265_WARNING_SLICEHEADER_INVALID; break; } // prepare thread context thread_context* tctx = sliceunit->get_thread_context(entryPt); tctx->shdr = shdr; tctx->decctx = img->decctx; tctx->img = img; tctx->imgunit = imgunit; tctx->sliceunit= sliceunit; tctx->CtbAddrInTS = pps.CtbAddrRStoTS[ctbAddrRS]; init_thread_context(tctx); // init CABAC int dataStartIndex; if (entryPt==0) { dataStartIndex=0; } else { dataStartIndex=shdr->entry_point_offset[entryPt-1]; } int dataEnd; if (entryPt==nRows-1) dataEnd = sliceunit->reader.bytes_remaining; else dataEnd = shdr->entry_point_offset[entryPt]; if (dataStartIndex<0 || dataEnd>sliceunit->reader.bytes_remaining || dataEnd <= dataStartIndex) { //printf("WPP premature end\n"); err = DE265_ERROR_PREMATURE_END_OF_SLICE; break; } tctx->cabac_decoder.init(&sliceunit->reader.data[dataStartIndex], dataEnd-dataStartIndex); // add task //printf("start task for ctb-row: %d\n",ctbRow); img->thread_start(1); sliceunit->nThreads++; add_task_decode_CTB_row(tctx, entryPt==0, ctbRow); } #if 0 for (;;) { printf("q:%d r:%d b:%d f:%d\n", img->nThreadsQueued, img->nThreadsRunning, img->nThreadsBlocked, img->nThreadsFinished); if (img->debug_is_completed()) break; usleep(1000); } #endif img->wait_for_completion(); for (size_t i=0;itasks.size();i++) delete imgunit->tasks[i]; imgunit->tasks.clear(); return err; } de265_error decoder_context::decode_slice_unit_tiles(image_unit* imgunit, slice_unit* sliceunit) { de265_error err = DE265_OK; de265_image* img = imgunit->img; slice_segment_header* shdr = sliceunit->shdr; const pic_parameter_set& pps = img->get_pps(); int nTiles = shdr->num_entry_point_offsets +1; int ctbsWidth = img->get_sps().PicWidthInCtbsY; assert(img->num_threads_active() == 0); sliceunit->allocate_thread_contexts(nTiles); // first CTB in this slice int ctbAddrRS = shdr->slice_segment_address; int tileID = pps.TileIdRS[ctbAddrRS]; for (int entryPt=0;entryPt0) { tileID++; if (tileID >= 
pps.num_tile_columns * pps.num_tile_rows) { err = DE265_WARNING_SLICEHEADER_INVALID; break; } int ctbX = pps.colBd[tileID % pps.num_tile_columns]; int ctbY = pps.rowBd[tileID / pps.num_tile_columns]; ctbAddrRS = ctbY * ctbsWidth + ctbX; } // set thread context thread_context* tctx = sliceunit->get_thread_context(entryPt); tctx->shdr = shdr; tctx->decctx = img->decctx; tctx->img = img; tctx->imgunit = imgunit; tctx->sliceunit= sliceunit; tctx->CtbAddrInTS = pps.CtbAddrRStoTS[ctbAddrRS]; init_thread_context(tctx); // init CABAC int dataStartIndex; if (entryPt==0) { dataStartIndex=0; } else { dataStartIndex=shdr->entry_point_offset[entryPt-1]; } int dataEnd; if (entryPt==nTiles-1) dataEnd = sliceunit->reader.bytes_remaining; else dataEnd = shdr->entry_point_offset[entryPt]; if (dataStartIndex<0 || dataEnd>sliceunit->reader.bytes_remaining || dataEnd <= dataStartIndex) { err = DE265_ERROR_PREMATURE_END_OF_SLICE; break; } tctx->cabac_decoder.init(&sliceunit->reader.data[dataStartIndex], dataEnd-dataStartIndex); // add task //printf("add tiles thread\n"); img->thread_start(1); sliceunit->nThreads++; add_task_decode_slice_segment(tctx, entryPt==0, ctbAddrRS % ctbsWidth, ctbAddrRS / ctbsWidth); } img->wait_for_completion(); for (size_t i=0;itasks.size();i++) delete imgunit->tasks[i]; imgunit->tasks.clear(); return err; } de265_error decoder_context::decode_NAL(NAL_unit* nal) { //return decode_NAL_OLD(nal); decoder_context* ctx = this; de265_error err = DE265_OK; bitreader reader(nal->data(), nal->size()); nal_header nal_hdr; err = nal_hdr.read(&reader); if (err != DE265_OK) { nal_parser.free_NAL_unit(nal); return err; } ctx->process_nal_hdr(&nal_hdr); if (nal_hdr.nuh_layer_id > 0) { // Discard all NAL units with nuh_layer_id > 0 // These will have to be handled by an SHVC decoder. 
nal_parser.free_NAL_unit(nal); return DE265_OK; } loginfo(LogHighlevel,"NAL: 0x%x 0x%x - unit type:%s temporal id:%d\n", nal->data()[0], nal->data()[1], get_NAL_name(nal_hdr.nal_unit_type), nal_hdr.nuh_temporal_id); /* printf("NAL: 0x%x 0x%x - unit type:%s temporal id:%d\n", nal->data()[0], nal->data()[1], get_NAL_name(nal_hdr.nal_unit_type), nal_hdr.nuh_temporal_id); */ // throw away NALs from higher TIDs than currently selected // TODO: better online switching of HighestTID //printf("hTid: %d\n", current_HighestTid); if (nal_hdr.nuh_temporal_id > current_HighestTid) { nal_parser.free_NAL_unit(nal); return DE265_OK; } if (nal_hdr.nal_unit_type<32) { err = read_slice_NAL(reader, nal, nal_hdr); } else switch (nal_hdr.nal_unit_type) { case NAL_UNIT_VPS_NUT: err = read_vps_NAL(reader); nal_parser.free_NAL_unit(nal); break; case NAL_UNIT_SPS_NUT: err = read_sps_NAL(reader); nal_parser.free_NAL_unit(nal); break; case NAL_UNIT_PPS_NUT: err = read_pps_NAL(reader); nal_parser.free_NAL_unit(nal); break; case NAL_UNIT_PREFIX_SEI_NUT: case NAL_UNIT_SUFFIX_SEI_NUT: err = read_sei_NAL(reader, nal_hdr.nal_unit_type==NAL_UNIT_SUFFIX_SEI_NUT); nal_parser.free_NAL_unit(nal); break; case NAL_UNIT_EOS_NUT: ctx->FirstAfterEndOfSequenceNAL = true; nal_parser.free_NAL_unit(nal); break; default: nal_parser.free_NAL_unit(nal); break; } return err; } de265_error decoder_context::decode(int* more) { decoder_context* ctx = this; // if the stream has ended, and no more NALs are to be decoded, flush all pictures if (ctx->nal_parser.get_NAL_queue_length() == 0 && (ctx->nal_parser.is_end_of_stream() || ctx->nal_parser.is_end_of_frame()) && ctx->image_units.empty()) { // flush all pending pictures into output queue // ctx->push_current_picture_to_output_queue(); // TODO: not with new queue ctx->dpb.flush_reorder_buffer(); if (more) { *more = ctx->dpb.num_pictures_in_output_queue(); } return DE265_OK; } // if NAL-queue is empty, we need more data // -> input stalled if 
(ctx->nal_parser.is_end_of_stream() == false && ctx->nal_parser.is_end_of_frame() == false && ctx->nal_parser.get_NAL_queue_length() == 0) { if (more) { *more=1; } return DE265_ERROR_WAITING_FOR_INPUT_DATA; } // when there are no free image buffers in the DPB, pause decoding // -> output stalled if (!ctx->dpb.has_free_dpb_picture(false)) { if (more) *more = 1; return DE265_ERROR_IMAGE_BUFFER_FULL; } // decode one NAL from the queue de265_error err = DE265_OK; bool did_work = false; if (ctx->nal_parser.get_NAL_queue_length()) { // number_of_NAL_units_pending()) { NAL_unit* nal = ctx->nal_parser.pop_from_NAL_queue(); assert(nal); err = ctx->decode_NAL(nal); // ctx->nal_parser.free_NAL_unit(nal); TODO: do not free NAL with new loop did_work=true; } else if (ctx->nal_parser.is_end_of_frame() == true && ctx->image_units.empty()) { if (more) { *more=1; } return DE265_ERROR_WAITING_FOR_INPUT_DATA; } else { err = decode_some(&did_work); } if (more) { // decoding error is assumed to be unrecoverable *more = (err==DE265_OK && did_work); } return err; } void decoder_context::process_nal_hdr(nal_header* nal) { nal_unit_type = nal->nal_unit_type; IdrPicFlag = isIdrPic(nal->nal_unit_type); RapPicFlag = isRapPic(nal->nal_unit_type); } /* 8.3.1 */ void decoder_context::process_picture_order_count(slice_segment_header* hdr) { loginfo(LogHeaders,"POC computation. 
lsb:%d prev.pic.lsb:%d msb:%d\n", hdr->slice_pic_order_cnt_lsb, prevPicOrderCntLsb, PicOrderCntMsb); if (isIRAP(nal_unit_type) && NoRaslOutputFlag) { PicOrderCntMsb=0; // flush all images from reorder buffer flush_reorder_buffer_at_this_frame = true; //ctx->dpb.flush_reorder_buffer(); } else { int MaxPicOrderCntLsb = current_sps->MaxPicOrderCntLsb; if ((hdr->slice_pic_order_cnt_lsb < prevPicOrderCntLsb) && (prevPicOrderCntLsb - hdr->slice_pic_order_cnt_lsb) >= MaxPicOrderCntLsb/2) { PicOrderCntMsb = prevPicOrderCntMsb + MaxPicOrderCntLsb; } else if ((hdr->slice_pic_order_cnt_lsb > prevPicOrderCntLsb) && (hdr->slice_pic_order_cnt_lsb - prevPicOrderCntLsb) > MaxPicOrderCntLsb/2) { PicOrderCntMsb = prevPicOrderCntMsb - MaxPicOrderCntLsb; } else { PicOrderCntMsb = prevPicOrderCntMsb; } } img->PicOrderCntVal = PicOrderCntMsb + hdr->slice_pic_order_cnt_lsb; img->picture_order_cnt_lsb = hdr->slice_pic_order_cnt_lsb; loginfo(LogHeaders,"POC computation. new msb:%d POC=%d\n", PicOrderCntMsb, img->PicOrderCntVal); if (img->nal_hdr.nuh_temporal_id==0 && !isSublayerNonReference(nal_unit_type) && !isRASL(nal_unit_type) && !isRADL(nal_unit_type)) { loginfo(LogHeaders,"set prevPicOrderCntLsb/Msb\n"); prevPicOrderCntLsb = hdr->slice_pic_order_cnt_lsb; prevPicOrderCntMsb = PicOrderCntMsb; } } /* 8.3.3.2 Returns DPB index of the generated picture. */ int decoder_context::generate_unavailable_reference_picture(const seq_parameter_set* sps, int POC, bool longTerm) { assert(dpb.has_free_dpb_picture(true)); std::shared_ptr current_sps = this->sps[ (int)current_pps->seq_parameter_set_id ]; int idx = dpb.new_image(current_sps, this, 0,0, false); if (idx<0) { return idx; } de265_image* img = dpb.get_image(idx); img->fill_image(1<<(sps->BitDepth_Y-1), 1<<(sps->BitDepth_C-1), 1<<(sps->BitDepth_C-1)); img->fill_pred_mode(MODE_INTRA); img->PicOrderCntVal = POC; img->picture_order_cnt_lsb = POC & (sps->MaxPicOrderCntLsb-1); img->PicOutputFlag = false; img->PicState = (longTerm ? 
UsedForLongTermReference : UsedForShortTermReference); img->integrity = INTEGRITY_UNAVAILABLE_REFERENCE; return idx; } /* 8.3.2 invoked once per picture This function will mark pictures in the DPB as 'unused' or 'used for long-term reference' */ de265_error decoder_context::process_reference_picture_set(slice_segment_header* hdr) { std::vector removeReferencesList; const uint32_t currentID = img->get_ID(); if (isIRAP(nal_unit_type) && NoRaslOutputFlag) { int currentPOC = img->PicOrderCntVal; // reset DPB /* The standard says: "When the current picture is an IRAP picture with NoRaslOutputFlag equal to 1, all reference pictures currently in the DPB (if any) are marked as "unused for reference". This seems to be wrong as it also throws out the first CRA picture in a stream like RAP_A (decoding order: CRA,POC=64, RASL,POC=60). Removing only the pictures with lower POCs seems to be compliant to the reference decoder. */ for (size_t i=0;iPicState != UnusedForReference && img->PicOrderCntVal < currentPOC && img->removed_at_picture_id > img->get_ID()) { removeReferencesList.push_back(img->get_ID()); img->removed_at_picture_id = img->get_ID(); //printf("will remove ID %d (a)\n",img->get_ID()); } } } if (isIDR(nal_unit_type)) { // clear all reference pictures NumPocStCurrBefore = 0; NumPocStCurrAfter = 0; NumPocStFoll = 0; NumPocLtCurr = 0; NumPocLtFoll = 0; } else { const ref_pic_set* rps = &hdr->CurrRps; // (8-98) int i,j,k; // scan ref-pic-set for smaller POCs and fill into PocStCurrBefore / PocStFoll for (i=0, j=0, k=0; iNumNegativePics; i++) { if (rps->UsedByCurrPicS0[i]) { PocStCurrBefore[j++] = img->PicOrderCntVal + rps->DeltaPocS0[i]; //printf("PocStCurrBefore = %d\n",PocStCurrBefore[j-1]); } else { PocStFoll[k++] = img->PicOrderCntVal + rps->DeltaPocS0[i]; } } NumPocStCurrBefore = j; // scan ref-pic-set for larger POCs and fill into PocStCurrAfter / PocStFoll for (i=0, j=0; iNumPositivePics; i++) { if (rps->UsedByCurrPicS1[i]) { PocStCurrAfter[j++] = 
img->PicOrderCntVal + rps->DeltaPocS1[i]; //printf("PocStCurrAfter = %d\n",PocStCurrAfter[j-1]); } else { PocStFoll[k++] = img->PicOrderCntVal + rps->DeltaPocS1[i]; } } NumPocStCurrAfter = j; NumPocStFoll = k; // find used / future long-term references for (i=0, j=0, k=0; //inum_long_term_ref_pics_sps + hdr->num_long_term_pics; inum_long_term_sps + hdr->num_long_term_pics; i++) { int pocLt = PocLsbLt[i]; if (hdr->delta_poc_msb_present_flag[i]) { int currentPictureMSB = img->PicOrderCntVal - hdr->slice_pic_order_cnt_lsb; if (DeltaPocMsbCycleLt[i] > static_cast(INT32_MAX) / current_sps->MaxPicOrderCntLsb) { add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } pocLt += currentPictureMSB - static_cast(DeltaPocMsbCycleLt[i] * current_sps->MaxPicOrderCntLsb); } if (UsedByCurrPicLt[i]) { PocLtCurr[j] = pocLt; CurrDeltaPocMsbPresentFlag[j] = hdr->delta_poc_msb_present_flag[i]; j++; } else { PocLtFoll[k] = pocLt; FollDeltaPocMsbPresentFlag[k] = hdr->delta_poc_msb_present_flag[i]; k++; } } NumPocLtCurr = j; NumPocLtFoll = k; } // (old 8-99) / (new 8-106) // 1. 
std::vector picInAnyList(dpb.size(), false); dpb.log_dpb_content(); for (int i=0;i=0) picInAnyList[k]=true; else { // TODO, CHECK: is it ok that we generate a picture with POC = LSB (PocLtCurr) // We do not know the correct MSB int concealedPicture = generate_unavailable_reference_picture(current_sps.get(), PocLtCurr[i], true); if (concealedPicture<0) { return (de265_error)(-concealedPicture); } picInAnyList.resize(dpb.size(), false); // adjust size of array to hold new picture RefPicSetLtCurr[i] = k = concealedPicture; picInAnyList[concealedPicture]=true; } if (dpb.get_image(k)->integrity != INTEGRITY_CORRECT) { img->integrity = INTEGRITY_DERIVED_FROM_FAULTY_REFERENCE; } } for (int i=0;i=0) picInAnyList[k]=true; else { int concealedPicture = k = generate_unavailable_reference_picture(current_sps.get(), PocLtFoll[i], true); if (concealedPicture<0) { return (de265_error)(-concealedPicture); } picInAnyList.resize(dpb.size(), false); // adjust size of array to hold new picture RefPicSetLtFoll[i] = concealedPicture; picInAnyList[concealedPicture]=true; } } // 2. Mark all pictures in RefPicSetLtCurr / RefPicSetLtFoll as UsedForLongTermReference for (int i=0;iPicState = UsedForLongTermReference; } for (int i=0;iPicState = UsedForLongTermReference; } // 3. 
for (int i=0;i idx=%d\n",PocStCurrBefore[i], k); RefPicSetStCurrBefore[i] = k; // -1 == "no reference picture" if (k>=0) picInAnyList[k]=true; else { int concealedPicture = generate_unavailable_reference_picture(current_sps.get(), PocStCurrBefore[i], false); if (concealedPicture<0) { return (de265_error)(-concealedPicture); } RefPicSetStCurrBefore[i] = k = concealedPicture; picInAnyList.resize(dpb.size(), false); // adjust size of array to hold new picture picInAnyList[concealedPicture] = true; //printf(" concealed: %d\n", concealedPicture); } if (dpb.get_image(k)->integrity != INTEGRITY_CORRECT) { img->integrity = INTEGRITY_DERIVED_FROM_FAULTY_REFERENCE; } } for (int i=0;i idx=%d\n",PocStCurrAfter[i], k); RefPicSetStCurrAfter[i] = k; // -1 == "no reference picture" if (k>=0) picInAnyList[k]=true; else { int concealedPicture = generate_unavailable_reference_picture(current_sps.get(), PocStCurrAfter[i], false); if (concealedPicture<0) { return (de265_error)(-concealedPicture); } RefPicSetStCurrAfter[i] = k = concealedPicture; picInAnyList.resize(dpb.size(), false); // adjust size of array to hold new picture picInAnyList[concealedPicture]=true; //printf(" concealed: %d\n", concealedPicture); } if (dpb.get_image(k)->integrity != INTEGRITY_CORRECT) { img->integrity = INTEGRITY_DERIVED_FROM_FAULTY_REFERENCE; } } for (int i=0;i=0) picInAnyList[k]=true; } // 4. 
any picture that is not marked for reference is put into the "UnusedForReference" state for (size_t i=0;i=picInAnyList.size() || !picInAnyList[i]) // no reference { de265_image* dpbimg = dpb.get_image(i); if (dpbimg != img && // not the current picture dpbimg->removed_at_picture_id > img->get_ID()) // has not been removed before { if (dpbimg->PicState != UnusedForReference) { removeReferencesList.push_back(dpbimg->get_ID()); //printf("will remove ID %d (b)\n",dpbimg->get_ID()); dpbimg->removed_at_picture_id = img->get_ID(); } } } hdr->RemoveReferencesList = removeReferencesList; //remove_images_from_dpb(hdr->RemoveReferencesList); return DE265_OK; } // 8.3.4 // Returns whether we can continue decoding (or whether there is a severe error). /* Called at beginning of each slice. Constructs - the RefPicList[2][], containing indices into the DPB, and - the RefPicList_POC[2][], containing POCs. - LongTermRefPic[2][] is also set to true if it is a long-term reference */ bool decoder_context::construct_reference_picture_lists(slice_segment_header* hdr) { int NumPocTotalCurr = hdr->NumPocTotalCurr; int NumRpsCurrTempList0 = libde265_max(hdr->num_ref_idx_l0_active, NumPocTotalCurr); // TODO: fold code for both lists together int RefPicListTemp0[3*MAX_NUM_REF_PICS]; // TODO: what would be the correct maximum ? int RefPicListTemp1[3*MAX_NUM_REF_PICS]; // TODO: what would be the correct maximum ? char isLongTerm[2][3*MAX_NUM_REF_PICS]; memset(isLongTerm,0,2*3*MAX_NUM_REF_PICS); /* --- Fill RefPicListTmp0 with reference pictures in this order: 1) short term, past POC 2) short term, future POC 3) long term */ int rIdx=0; while (rIdx < NumRpsCurrTempList0) { for (int i=0;inum_ref_idx_l0_active > 16) { add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false); return false; } */ assert(hdr->num_ref_idx_l0_active <= 16); for (rIdx=0; rIdxnum_ref_idx_l0_active; rIdx++) { int idx = hdr->ref_pic_list_modification_flag_l0 ? 
hdr->list_entry_l0[rIdx] : rIdx; if (idx >= NumRpsCurrTempList0) { add_warning(DE265_WARNING_FAULTY_REFERENCE_PICTURE_LIST, false); return false; } hdr->RefPicList[0][rIdx] = RefPicListTemp0[idx]; hdr->LongTermRefPic[0][rIdx] = isLongTerm[0][idx]; // remember POC of referenced image (needed in motion.c, derive_collocated_motion_vector) de265_image* img_0_rIdx = dpb.get_image(hdr->RefPicList[0][rIdx]); if (img_0_rIdx==nullptr) { return false; } hdr->RefPicList_POC[0][rIdx] = img_0_rIdx->PicOrderCntVal; hdr->RefPicList_PicState[0][rIdx] = img_0_rIdx->PicState; } /* --- Fill RefPicListTmp1 with reference pictures in this order: 1) short term, future POC 2) short term, past POC 3) long term */ if (hdr->slice_type == SLICE_TYPE_B) { int NumRpsCurrTempList1 = libde265_max(hdr->num_ref_idx_l1_active, NumPocTotalCurr); int rIdx=0; while (rIdx < NumRpsCurrTempList1) { for (int i=0;inum_ref_idx_l0_active > 16) { add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false); return false; } assert(hdr->num_ref_idx_l1_active <= 16); for (rIdx=0; rIdxnum_ref_idx_l1_active; rIdx++) { int idx = hdr->ref_pic_list_modification_flag_l1 ? 
hdr->list_entry_l1[rIdx] : rIdx; if (idx >= NumRpsCurrTempList1) { add_warning(DE265_WARNING_FAULTY_REFERENCE_PICTURE_LIST, false); return false; } hdr->RefPicList[1][rIdx] = RefPicListTemp1[idx]; hdr->LongTermRefPic[1][rIdx] = isLongTerm[1][idx]; // remember POC of referenced imaged (needed in motion.c, derive_collocated_motion_vector) de265_image* img_1_rIdx = dpb.get_image(hdr->RefPicList[1][rIdx]); if (img_1_rIdx == nullptr) { return false; } hdr->RefPicList_POC[1][rIdx] = img_1_rIdx->PicOrderCntVal; hdr->RefPicList_PicState[1][rIdx] = img_1_rIdx->PicState; } } // show reference picture lists loginfo(LogHeaders,"RefPicList[0] ="); for (rIdx=0; rIdxnum_ref_idx_l0_active; rIdx++) { loginfo(LogHeaders,"* [%d]=%d (LT=%d)", hdr->RefPicList[0][rIdx], hdr->RefPicList_POC[0][rIdx], hdr->LongTermRefPic[0][rIdx] ); } loginfo(LogHeaders,"*\n"); if (hdr->slice_type == SLICE_TYPE_B) { loginfo(LogHeaders,"RefPicList[1] ="); for (rIdx=0; rIdxnum_ref_idx_l1_active; rIdx++) { loginfo(LogHeaders,"* [%d]=%d (LT=%d)", hdr->RefPicList[1][rIdx], hdr->RefPicList_POC[1][rIdx], hdr->LongTermRefPic[1][rIdx] ); } loginfo(LogHeaders,"*\n"); } return true; } void decoder_context::run_postprocessing_filters_sequential(de265_image* img) { #if SAVE_INTERMEDIATE_IMAGES char buf[1000]; sprintf(buf,"pre-lf-%05d.yuv", img->PicOrderCntVal); write_picture_to_file(img, buf); #endif if (!img->decctx->param_disable_deblocking) { apply_deblocking_filter(img); } #if SAVE_INTERMEDIATE_IMAGES sprintf(buf,"pre-sao-%05d.yuv", img->PicOrderCntVal); write_picture_to_file(img, buf); #endif if (!img->decctx->param_disable_sao) { apply_sample_adaptive_offset_sequential(img); } #if SAVE_INTERMEDIATE_IMAGES sprintf(buf,"sao-%05d.yuv", img->PicOrderCntVal); write_picture_to_file(img, buf); #endif } void decoder_context::run_postprocessing_filters_parallel(image_unit* imgunit) { de265_image* img = imgunit->img; int saoWaitsForProgress = CTB_PROGRESS_PREFILTER; bool waitForCompletion = false; if 
(!img->decctx->param_disable_deblocking) { add_deblocking_tasks(imgunit); saoWaitsForProgress = CTB_PROGRESS_DEBLK_H; } if (!img->decctx->param_disable_sao) { waitForCompletion |= add_sao_tasks(imgunit, saoWaitsForProgress); //apply_sample_adaptive_offset(img); } // The original intention was to skip wait_for_completion() if there is no SAO task, // but it does not work as intended. (TODO: check why) (void)waitForCompletion; img->wait_for_completion(); } /* void decoder_context::push_current_picture_to_output_queue() { push_picture_to_output_queue(img); } */ de265_error decoder_context::push_picture_to_output_queue(image_unit* imgunit) { de265_image* outimg = imgunit->img; if (outimg==nullptr) { return DE265_OK; } // push image into output queue if (outimg->PicOutputFlag) { loginfo(LogDPB,"new picture has output-flag=true\n"); if (outimg->integrity != INTEGRITY_CORRECT && param_suppress_faulty_pictures) { } else { dpb.insert_image_into_reorder_buffer(outimg); } loginfo(LogDPB,"push image %d into reordering queue\n", outimg->PicOrderCntVal); } // check for full reorder buffers int maxNumPicsInReorderBuffer = 0; // TODO: I'd like to have the has_vps() check somewhere else (not decode the picture at all) if (outimg->has_vps()) { int sublayer = outimg->get_vps().vps_max_sub_layers -1; maxNumPicsInReorderBuffer = outimg->get_vps().layer[sublayer].vps_max_num_reorder_pics; } if (dpb.num_pictures_in_reorder_buffer() > maxNumPicsInReorderBuffer) { dpb.output_next_picture_in_reorder_buffer(); } dpb.log_dpb_queues(); return DE265_OK; } // returns whether we can continue decoding the stream or whether we should give up bool decoder_context::process_slice_segment_header(slice_segment_header* hdr, de265_error* err, de265_PTS pts, nal_header* nal_hdr, void* user_data) { *err = DE265_OK; flush_reorder_buffer_at_this_frame = false; // get PPS and SPS for this slice int pps_id = hdr->slice_pic_parameter_set_id; if (pps[pps_id]==nullptr || pps[pps_id]->pps_read==false) { 
logerror(LogHeaders, "PPS %d has not been read\n", pps_id); img->decctx->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false); return false; } current_pps = pps[pps_id]; current_sps = sps[ (int)current_pps->seq_parameter_set_id ]; current_vps = vps[ (int)current_sps->video_parameter_set_id ]; calc_tid_and_framerate_ratio(); // --- prepare decoding of new picture --- if (hdr->first_slice_segment_in_pic_flag) { // previous picture has been completely decoded //ctx->push_current_picture_to_output_queue(); current_image_poc_lsb = hdr->slice_pic_order_cnt_lsb; seq_parameter_set* sps = current_sps.get(); // --- find and allocate image buffer for decoding --- int image_buffer_idx; bool isOutputImage = (!sps->sample_adaptive_offset_enabled_flag || param_disable_sao); image_buffer_idx = dpb.new_image(current_sps, this, pts, user_data, isOutputImage); if (image_buffer_idx < 0) { *err = (de265_error)(-image_buffer_idx); return false; } /*de265_image* */ img = dpb.get_image(image_buffer_idx); img->nal_hdr = *nal_hdr; // Note: sps is already set in new_image() -> ??? still the case with shared_ptr ? 
img->set_headers(current_vps, current_sps, current_pps); img->decctx = this; img->clear_metadata(); if (isIRAP(nal_unit_type)) { if (isIDR(nal_unit_type) || isBLA(nal_unit_type) || first_decoded_picture || FirstAfterEndOfSequenceNAL) { NoRaslOutputFlag = true; FirstAfterEndOfSequenceNAL = false; } else if (0) // TODO: set HandleCraAsBlaFlag by external means { } else { NoRaslOutputFlag = false; HandleCraAsBlaFlag = false; } } if (isRASL(nal_unit_type) && NoRaslOutputFlag) { img->PicOutputFlag = false; } else { img->PicOutputFlag = !!hdr->pic_output_flag; } process_picture_order_count(hdr); if (hdr->first_slice_segment_in_pic_flag) { // mark picture so that it is not overwritten by unavailable reference frames img->PicState = UsedForShortTermReference; *err = process_reference_picture_set(hdr); if (*err != DE265_OK) { return false; } } img->PicState = UsedForShortTermReference; log_set_current_POC(img->PicOrderCntVal); // next image is not the first anymore first_decoded_picture = false; } else { // claims to be not the first slice, but there is no active image available if (img == nullptr) { return false; } } if (hdr->slice_type == SLICE_TYPE_B || hdr->slice_type == SLICE_TYPE_P) { bool success = construct_reference_picture_lists(hdr); if (!success) { return false; } } //printf("process slice segment header\n"); loginfo(LogHeaders,"end of process-slice-header\n"); dpb.log_dpb_content(); if (hdr->dependent_slice_segment_flag==0) { hdr->SliceAddrRS = hdr->slice_segment_address; } else { hdr->SliceAddrRS = previous_slice_header->SliceAddrRS; } previous_slice_header = hdr; loginfo(LogHeaders,"SliceAddrRS = %d\n",hdr->SliceAddrRS); return true; } void decoder_context::remove_images_from_dpb(const std::vector& removeImageList) { for (size_t i=0;i=0) { //printf("remove ID %d\n", removeImageList[i]); de265_image* dpbimg = dpb.get_image( idx ); dpbimg->PicState = UnusedForReference; } } } /* . 
0 1 2 <- goal_HighestTid +-----+-----+-----+ | -0->| -1->| -2->| +-----+-----+-----+ 0 33 66 100 <- framerate_ratio */ int decoder_context::get_highest_TID() const { if (current_sps) { return current_sps->sps_max_sub_layers-1; } if (current_vps) { return current_vps->vps_max_sub_layers-1; } return 6; } void decoder_context::set_limit_TID(int max_tid) { limit_HighestTid = max_tid; calc_tid_and_framerate_ratio(); } int decoder_context::change_framerate(int more) { if (current_sps == nullptr) { return framerate_ratio; } int highestTid = get_highest_TID(); assert(more>=-1 && more<=1); goal_HighestTid += more; goal_HighestTid = std::max(goal_HighestTid, 0); goal_HighestTid = std::min(goal_HighestTid, highestTid); framerate_ratio = framedrop_tid_index[goal_HighestTid]; calc_tid_and_framerate_ratio(); return framerate_ratio; } void decoder_context::set_framerate_ratio(int percent) { framerate_ratio = percent; calc_tid_and_framerate_ratio(); } void decoder_context::compute_framedrop_table() { int highestTID = get_highest_TID(); for (int tid=highestTID ; tid>=0 ; tid--) { int lower = 100 * tid /(highestTID+1); int higher = 100 * (tid+1)/(highestTID+1); for (int l=lower; l<=higher; l++) { int ratio = 100 * (l-lower) / (higher-lower); // if we would exceed our TID limit, decode the highest TID at full frame-rate if (tid > limit_HighestTid) { tid = limit_HighestTid; ratio = 100; } framedrop_tab[l].tid = tid; framedrop_tab[l].ratio = ratio; } framedrop_tid_index[tid] = higher; } #if 0 for (int i=0;i<=100;i++) { printf("%d%%: %d/%d",i, framedrop_tab[i].tid, framedrop_tab[i].ratio); for (int k=0;k<=highestTID;k++) { if (framedrop_tid_index[k] == i) printf(" ** TID=%d **",k); } printf("\n"); } #endif } void decoder_context::calc_tid_and_framerate_ratio() { int highestTID = get_highest_TID(); // if number of temporal layers changed, we have to recompute the framedrop table if (framedrop_tab[100].tid != highestTID) { compute_framedrop_table(); } goal_HighestTid = 
framedrop_tab[framerate_ratio].tid; layer_framerate_ratio = framedrop_tab[framerate_ratio].ratio; // TODO: for now, we switch immediately current_HighestTid = goal_HighestTid; } void error_queue::add_warning(de265_error warning, bool once) { std::lock_guard lock(m_mutex); // check if warning was already shown if (once) { if (std::find(warnings_shown.begin(), warnings_shown.end(), warning) != warnings_shown.end()) { return; } warnings_shown.push_back(warning); } // add warning to output queue if (warnings.size() >= MAX_WARNINGS) { warnings.back() = DE265_WARNING_WARNING_BUFFER_FULL; return; } warnings.push_back(warning); } de265_error error_queue::get_warning() { std::lock_guard lock(m_mutex); if (warnings.empty()) { return DE265_OK; } de265_error warn = warnings.front(); warnings.erase(warnings.begin()); return warn; } libde265-1.0.18/libde265/decctx.h000066400000000000000000000360211515675107500160720ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #ifndef DE265_DECCTX_H #define DE265_DECCTX_H #include "libde265/vps.h" #include "libde265/sps.h" #include "libde265/pps.h" #include "libde265/nal.h" #include "libde265/slice.h" #include "libde265/image.h" #include "libde265/motion.h" #include "libde265/de265.h" #include "libde265/dpb.h" #include "libde265/sei.h" #include "libde265/threads.h" #include "libde265/acceleration.h" #include "libde265/nal-parser.h" #include constexpr int DE265_MAX_VPS_SETS = 16; // this is the maximum as defined in the standard constexpr int DE265_MAX_SPS_SETS = 16; // this is the maximum as defined in the standard constexpr int DE265_MAX_PPS_SETS = 64; // this is the maximum as defined in the standard constexpr int MAX_WARNINGS = 20; class slice_segment_header; class image_unit; class slice_unit; class decoder_context; class thread_context { public: thread_context(); uint32_t CtbAddrInRS; uint32_t CtbAddrInTS; uint16_t CtbX, CtbY; // motion vectors PBMotionCoding motion; // prediction // enum IntraPredMode IntraPredModeC[4]; // chroma intra-prediction mode for current CB int ResScaleVal; // residual data uint8_t cu_transquant_bypass_flag; uint8_t transform_skip_flag[3]; uint8_t explicit_rdpcm_flag; uint8_t explicit_rdpcm_dir; // we need 16 bytes of extra memory (8*int16) to shift the base for the // alignment required for SSE code ! 
int16_t _coeffBuf[(32*32)+8]; int16_t *coeffBuf; // the base pointer for into _coeffBuf, aligned to 16 bytes int16_t coeffList[3][32*32]; int16_t coeffPos[3][32*32]; int16_t nCoeff[3]; int32_t residual_luma[32*32]; // only used when cross-comp-prediction is enabled // quantization int IsCuQpDeltaCoded = 0; int CuQpDelta = 0; int IsCuChromaQpOffsetCoded = 0; int CuQpOffsetCb = 0, CuQpOffsetCr = 0; int currentQPY; int currentQG_x, currentQG_y; int lastQPYinPreviousQG; int qPYPrime, qPCbPrime, qPCrPrime; CABAC_decoder cabac_decoder; context_model_table ctx_model; uint8_t StatCoeff[4]; decoder_context* decctx = nullptr; struct de265_image *img = nullptr; slice_segment_header* shdr = nullptr; image_unit* imgunit = nullptr; slice_unit* sliceunit = nullptr; thread_task* task; // executing thread_task or nullptr if not multi-threaded thread_context(const thread_context&) = delete; thread_context& operator=(const thread_context&) = delete; }; class error_queue { public: void add_warning(de265_error warning, bool once); de265_error get_warning(); private: std::mutex m_mutex; std::vector warnings; std::vector warnings_shown; // warnings that have already occurred }; class slice_unit { public: slice_unit(decoder_context* decctx); ~slice_unit(); NAL_unit* nal; // we are the owner slice_segment_header* shdr; // not the owner (de265_image is owner) bitreader reader; image_unit* imgunit; bool flush_reorder_buffer; // decoding status enum SliceDecodingProgress { Unprocessed, InProgress, Decoded } state; de265_progress_lock finished_threads; int nThreads; int first_decoded_CTB_RS; // TODO int last_decoded_CTB_RS; // TODO void allocate_thread_contexts(int n); thread_context* get_thread_context(int n) { assert(n < nThreadContexts); return &thread_contexts[n]; } int num_thread_contexts() const { return nThreadContexts; } private: thread_context* thread_contexts; /* NOTE: cannot use std::vector, because thread_context has no copy constructor. 
*/ int nThreadContexts; public: decoder_context* ctx; slice_unit(const slice_unit&) = delete; slice_unit& operator=(const slice_unit&) = delete; }; class image_unit { public: image_unit(); ~image_unit(); de265_image* img = nullptr; de265_image sao_output; // if SAO is used, this is allocated and used as SAO output buffer std::vector slice_units; std::vector suffix_SEIs; slice_unit* get_next_unprocessed_slice_segment() const { for (size_t i=0;istate == slice_unit::Unprocessed) { return slice_units[i]; } } return nullptr; } slice_unit* get_prev_slice_segment(slice_unit* s) const { for (size_t i=1; i(slice_units[i])); } } bool all_slice_segments_processed() const { if (slice_units.size()==0) return true; if (slice_units.back()->state != slice_unit::Unprocessed) return true; return false; } bool is_first_slice_segment(const slice_unit* s) const { if (slice_units.size()==0) return false; return (slice_units[0] == s); } enum { Invalid, // headers not read yet Unknown, // SPS/PPS available Reference, // will be used as reference Leaf // not a reference picture } role = Invalid; enum { Unprocessed, InProgress, Decoded, Dropped // will not be decoded } state = Unprocessed; std::vector tasks; // we are the owner /* Saved context models for WPP. There is one saved model for the initialization of each CTB row. The array is unused for non-WPP streams. */ std::vector ctx_models; // TODO: move this into image ? 
}; class base_context : public error_queue { public: base_context(); virtual ~base_context() { } // --- accelerated DSP functions --- void set_acceleration_functions(enum de265_acceleration); struct acceleration_functions acceleration; // CPU optimized functions //virtual /* */ de265_image* get_image(uint16_t dpb_index) { return dpb.get_image(dpb_index); } virtual const de265_image* get_image(uint16_t frame_id) const = 0; virtual bool has_image(uint16_t frame_id) const = 0; }; class decoder_context : public base_context { public: decoder_context(); ~decoder_context(); de265_error start_thread_pool(int nThreads); void stop_thread_pool(); void reset(); bool has_sps(int id) const { return sps[id] != nullptr; } bool has_pps(int id) const { return pps[id] != nullptr; } std::shared_ptr get_shared_sps(int id) { return sps[id]; } std::shared_ptr get_shared_pps(int id) { return pps[id]; } /* */ seq_parameter_set* get_sps(int id) { return sps[id].get(); } const seq_parameter_set* get_sps(int id) const { return sps[id].get(); } /* */ pic_parameter_set* get_pps(int id) { return pps[id].get(); } const pic_parameter_set* get_pps(int id) const { return pps[id].get(); } /* const slice_segment_header* get_SliceHeader_atCtb(int ctb) { return img->slices[img->get_SliceHeaderIndex_atIndex(ctb)]; } */ uint8_t get_nal_unit_type() const { return nal_unit_type; } bool get_RapPicFlag() const { return RapPicFlag; } de265_error decode_NAL(NAL_unit* nal); de265_error decode(int* more); de265_error decode_some(bool* did_work); de265_error decode_slice_unit_sequential(image_unit* imgunit, slice_unit* sliceunit); de265_error decode_slice_unit_parallel(image_unit* imgunit, slice_unit* sliceunit); de265_error decode_slice_unit_WPP(image_unit* imgunit, slice_unit* sliceunit); de265_error decode_slice_unit_tiles(image_unit* imgunit, slice_unit* sliceunit); void process_nal_hdr(nal_header*); bool process_slice_segment_header(slice_segment_header*, de265_error*, de265_PTS pts, nal_header* nal_hdr, 
void* user_data); //void push_current_picture_to_output_queue(); de265_error push_picture_to_output_queue(image_unit*); // --- parameters --- bool param_sei_check_hash = false; bool param_conceal_stream_errors = true; bool param_suppress_faulty_pictures = false; int param_sps_headers_fd = -1; int param_vps_headers_fd = -1; int param_pps_headers_fd = -1; int param_slice_headers_fd = -1; bool param_disable_deblocking = false; bool param_disable_sao = false; //bool param_disable_mc_residual_idct; // not implemented yet //bool param_disable_intra_residual_idct; // not implemented yet void set_image_allocation_functions(de265_image_allocation* allocfunc, void* userdata); de265_image_allocation param_image_allocation_functions; // initialized in constructor void* param_image_allocation_userdata = nullptr; // --- input stream data --- NAL_Parser nal_parser; int get_num_worker_threads() const { return num_worker_threads; } /* */ de265_image* get_image(uint16_t dpb_index) { return dpb.get_image(dpb_index); } const de265_image* get_image(uint16_t dpb_index) const override { return dpb.get_image(dpb_index); } bool has_image(uint16_t dpb_index) const override { return dpb_index vps[ DE265_MAX_VPS_SETS ]; std::shared_ptr sps[ DE265_MAX_SPS_SETS ]; std::shared_ptr pps[ DE265_MAX_PPS_SETS ]; std::shared_ptr current_vps; std::shared_ptr current_sps; std::shared_ptr current_pps; public: thread_pool thread_pool_; private: int num_worker_threads = 0; public: // --- frame dropping --- void set_limit_TID(int tid); int get_highest_TID() const; int get_current_TID() const { return current_HighestTid; } int change_framerate(int more_vs_less); // 1: more, -1: less void set_framerate_ratio(int percent); private: // input parameters int limit_HighestTid = 6; // never switch to a layer above this one int framerate_ratio = 100; // current control parameters int goal_HighestTid = 6; // this is the layer we want to decode at int layer_framerate_ratio = 100; // ratio of frames to keep in the 
current layer int current_HighestTid = 6; // the layer which we are currently decoding struct { int8_t tid; int8_t ratio; } framedrop_tab[100+1]; int framedrop_tid_index[6+1]; void compute_framedrop_table(); void calc_tid_and_framerate_ratio(); private: // --- decoded picture buffer --- decoded_picture_buffer dpb; int current_image_poc_lsb = -1; bool first_decoded_picture = true; bool NoRaslOutputFlag = false; bool HandleCraAsBlaFlag = false; bool FirstAfterEndOfSequenceNAL = false; int PicOrderCntMsb = 0; int prevPicOrderCntLsb = 0; // at precTid0Pic int prevPicOrderCntMsb = 0; // at precTid0Pic de265_image* img = nullptr; public: const slice_segment_header* previous_slice_header = nullptr; /* Remember the last slice for a successive dependent slice. */ // --- motion compensation --- public: int PocLsbLt[MAX_NUM_REF_PICS]{}; int UsedByCurrPicLt[MAX_NUM_REF_PICS]{}; uint32_t DeltaPocMsbCycleLt[MAX_NUM_REF_PICS]{}; private: int CurrDeltaPocMsbPresentFlag[MAX_NUM_REF_PICS]{}; int FollDeltaPocMsbPresentFlag[MAX_NUM_REF_PICS]{}; // The number of entries in the lists below. int NumPocStCurrBefore = 0; int NumPocStCurrAfter = 0; int NumPocStFoll = 0; int NumPocLtCurr = 0; int NumPocLtFoll = 0; // These lists contain absolute POC values. int PocStCurrBefore[MAX_NUM_REF_PICS]{}; // used for reference in current picture, smaller POC int PocStCurrAfter[MAX_NUM_REF_PICS]{}; // used for reference in current picture, larger POC int PocStFoll[MAX_NUM_REF_PICS]{}; // not used for reference in current picture, but in future picture int PocLtCurr[MAX_NUM_REF_PICS]{}; // used in current picture int PocLtFoll[MAX_NUM_REF_PICS]{}; // used in some future picture // These lists contain indices into the DPB. 
int RefPicSetStCurrBefore[MAX_NUM_REF_PICS]{}; int RefPicSetStCurrAfter[MAX_NUM_REF_PICS]{}; int RefPicSetStFoll[MAX_NUM_REF_PICS]{}; int RefPicSetLtCurr[MAX_NUM_REF_PICS]{}; int RefPicSetLtFoll[MAX_NUM_REF_PICS]{}; // --- parameters derived from parameter sets --- // NAL uint8_t nal_unit_type = 0; bool IdrPicFlag = false; bool RapPicFlag = false; // --- image unit queue --- std::vector image_units; bool flush_reorder_buffer_at_this_frame = false; private: void init_thread_context(thread_context* tctx); void add_task_decode_CTB_row(thread_context* tctx, bool firstSliceSubstream, int ctbRow); void add_task_decode_slice_segment(thread_context* tctx, bool firstSliceSubstream, int ctbX,int ctbY); void mark_whole_slice_as_processed(image_unit* imgunit, slice_unit* sliceunit, int progress); void process_picture_order_count(slice_segment_header* hdr); /* If there is no space for a new image, returns the negative value of an de265_error. I.e. you can check for error by return_value<0, which is error (-return_value); */ int generate_unavailable_reference_picture(const seq_parameter_set* sps, int POC, bool longTerm); de265_error process_reference_picture_set(slice_segment_header* hdr); bool construct_reference_picture_lists(slice_segment_header* hdr); void remove_images_from_dpb(const std::vector& removeImageList); void run_postprocessing_filters_sequential(struct de265_image* img); void run_postprocessing_filters_parallel(image_unit* img); }; #endif libde265-1.0.18/libde265/dpb.cc000066400000000000000000000176331515675107500155330ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. 
* * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #include "dpb.h" #include "decctx.h" #include #include decoded_picture_buffer::~decoded_picture_buffer() { for (size_t i=0;iPicOrderCntVal, dpb[i]->get_ID(), dpb[i]->PicState == UnusedForReference ? "unused" : dpb[i]->PicState == UsedForShortTermReference ? "short-term" : "long-term", dpb[i]->PicOutputFlag ? "output" : "---"); } } bool decoded_picture_buffer::has_free_dpb_picture(bool high_priority) const { // we will always adapt the buffer to insert high-priority images if (high_priority) return true; // quick test to check for free slots if (dpb.size() < max_images_in_DPB) return true; // scan for empty slots for (size_t i=0;iPicOutputFlag==false && dpb[i]->PicState == UnusedForReference) { return true; } } return false; } int decoded_picture_buffer::DPB_index_of_picture_with_POC(int poc, uint32_t currentID, bool preferLongTerm) const { logdebug(LogHeaders,"DPB_index_of_picture_with_POC POC=%d\n",poc); //log_dpb_content(ctx); //loginfo(LogDPB,"searching for short-term reference POC=%d\n",poc); if (preferLongTerm) { for (size_t k=0;kPicOrderCntVal == poc && dpb[k]->removed_at_picture_id > currentID && dpb[k]->PicState == UsedForLongTermReference) { return k; } } } for (size_t k=0;kPicOrderCntVal == poc && dpb[k]->removed_at_picture_id > currentID && dpb[k]->PicState != UnusedForReference) { return k; } } return -1; } int decoded_picture_buffer::DPB_index_of_picture_with_LSB(int lsb, uint32_t currentID, bool preferLongTerm) const { logdebug(LogHeaders,"get access to picture with LSB %d from DPB\n",lsb); if (preferLongTerm) { for (size_t k=0;kpicture_order_cnt_lsb == lsb && dpb[k]->removed_at_picture_id > 
currentID && dpb[k]->PicState == UsedForLongTermReference) { return k; } } } for (size_t k=0;kpicture_order_cnt_lsb == lsb && dpb[k]->removed_at_picture_id > currentID && dpb[k]->PicState != UnusedForReference) { return k; } } return -1; } int decoded_picture_buffer::DPB_index_of_picture_with_ID(uint32_t id) const { logdebug(LogHeaders,"get access to picture with ID %d from DPB\n",id); for (size_t k=0;kget_ID() == id) { return k; } } return -1; } void decoded_picture_buffer::output_next_picture_in_reorder_buffer() { assert(!reorder_output_queue.empty()); // search for picture in reorder buffer with minimum POC int minPOC = reorder_output_queue[0]->PicOrderCntVal; int minIdx = 0; for (size_t i=1;iPicOrderCntVal < minPOC) { minPOC = reorder_output_queue[i]->PicOrderCntVal; minIdx = i; } } // put image into output queue image_output_queue.push_back(reorder_output_queue[minIdx]); // remove image from reorder buffer reorder_output_queue[minIdx] = reorder_output_queue.back(); reorder_output_queue.pop_back(); } bool decoded_picture_buffer::flush_reorder_buffer() { // return 'false' when there are no pictures in reorder buffer if (reorder_output_queue.empty()) return false; while (!reorder_output_queue.empty()) { output_next_picture_in_reorder_buffer(); } return true; } void decoded_picture_buffer::clear() { for (size_t i=0;iPicOutputFlag || dpb[i]->PicState != UnusedForReference) { dpb[i]->PicOutputFlag = false; dpb[i]->PicState = UnusedForReference; dpb[i]->release(); } } reorder_output_queue.clear(); image_output_queue.clear(); } int decoded_picture_buffer::new_image(std::shared_ptr sps, decoder_context* decctx, de265_PTS pts, void* user_data, bool isOutputImage) { loginfo(LogHeaders,"DPB::new_image\n"); log_dpb_content(); // --- search for a free slot in the DPB --- uint8_t free_image_buffer_idx = 0; uint8_t err = DE265_ERROR_IMAGE_BUFFER_FULL; for (size_t i=0;ican_be_released()) { dpb[i]->release(); /* TODO: this is surely not the best place to free the image, but we 
have to do it here because releasing it in de265_release_image() would break the API compatibility. */ free_image_buffer_idx = i; err = DE265_OK; break; } } // Try to free a buffer at the end if the DPB got too large. /* This should also probably move to a better place as soon as the API allows for this. */ if (dpb.size() > norm_images_in_DPB && // buffer too large free_image_buffer_idx != dpb.size()-1 && // last slot not reused in this alloc dpb.back()->can_be_released()) // last slot is free { delete dpb.back(); dpb.pop_back(); } // create a new image slot if no empty slot remaining if (err == DE265_ERROR_IMAGE_BUFFER_FULL) { size_t dpb_size = dpb.size(); assert(dpb_size < 255); free_image_buffer_idx = static_cast(dpb_size); dpb.push_back(new de265_image); err = DE265_OK; } // --- allocate new image --- if (err) { return -err; } de265_image* img = dpb[free_image_buffer_idx]; int w = sps->pic_width_in_luma_samples; int h = sps->pic_height_in_luma_samples; enum de265_chroma chroma; switch (sps->chroma_format_idc) { case 0: chroma = de265_chroma_mono; break; case 1: chroma = de265_chroma_420; break; case 2: chroma = de265_chroma_422; break; case 3: chroma = de265_chroma_444; break; default: chroma = de265_chroma_420; assert(0); break; // should never happen } de265_error error = img->alloc_image(w,h, chroma, sps, true, decctx, /*nullptr,*/ pts, user_data, isOutputImage); if (error) { return -error; } img->integrity = INTEGRITY_CORRECT; return free_image_buffer_idx; } void decoded_picture_buffer::pop_next_picture_in_output_queue() { image_output_queue.pop_front(); loginfo(LogDPB, "DPB output queue: "); for (size_t i=0;iPicOrderCntVal); } loginfo(LogDPB,"*\n"); } void decoded_picture_buffer::log_dpb_queues() const { loginfo(LogDPB, "DPB reorder queue (after push): "); for (int i=0;iPicOrderCntVal); } loginfo(LogDPB,"*\n"); loginfo(LogDPB, "DPB output queue (after push): "); for (int i=0;iPicOrderCntVal); } loginfo(LogDPB,"*\n"); } 
libde265-1.0.18/libde265/dpb.h000066400000000000000000000077111515675107500153710ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef DE265_DPB_H #define DE265_DPB_H #include "libde265/image.h" #include "libde265/sps.h" #include #include class decoder_context; class decoded_picture_buffer { public: decoded_picture_buffer() = default; ~decoded_picture_buffer(); void set_max_size_of_DPB(uint8_t n) { max_images_in_DPB=n; } void set_norm_size_of_DPB(uint8_t n) { norm_images_in_DPB=n; } /* Alloc a new image in the DPB and return its index. If there is no space for a new image, returns the negative value of an de265_error. I.e. you can check for error by return_value<0, which is error (-return_value); */ int new_image(std::shared_ptr sps, decoder_context* decctx, de265_PTS pts, void* user_data, bool isOutputImage); /* Check for a free slot in the DPB. There are some slots reserved for unavailable reference frames. If high_priority==true, these reserved slots are included in the check. */ bool has_free_dpb_picture(bool high_priority) const; /* Remove all pictures from DPB and queues. Decoding should be stopped while calling this. */ void clear(); size_t size() const { return dpb.size(); } /* Raw access to the images. 
*/ /* */ de265_image* get_image(uint16_t index) { if (index>=dpb.size()) return nullptr; return dpb[index]; } const de265_image* get_image(uint16_t index) const { if (index>=dpb.size()) return nullptr; return dpb[index]; } /* Search DPB for the slot index of a specific picture. */ int DPB_index_of_picture_with_POC(int poc, uint32_t currentID, bool preferLongTerm=false) const; int DPB_index_of_picture_with_LSB(int lsb, uint32_t currentID, bool preferLongTerm=false) const; int DPB_index_of_picture_with_ID (uint32_t id) const; // --- reorder buffer --- void insert_image_into_reorder_buffer(struct de265_image* img) { reorder_output_queue.push_back(img); } int num_pictures_in_reorder_buffer() const { return reorder_output_queue.size(); } // move next picture in reorder buffer to output queue void output_next_picture_in_reorder_buffer(); // Move all pictures in reorder buffer to output buffer. Return true if there were any pictures. bool flush_reorder_buffer(); // --- output buffer --- int num_pictures_in_output_queue() const { return image_output_queue.size(); } /* Get the next picture in the output queue, but do not remove it from the queue. */ struct de265_image* get_next_picture_in_output_queue() const { return image_output_queue.front(); } /* Remove the next picture in the output queue. */ void pop_next_picture_in_output_queue(); // --- debug --- void log_dpb_content() const; void log_dpb_queues() const; private: static const int DPB_DEFAULT_MAX_IMAGES = 30; uint8_t max_images_in_DPB = DPB_DEFAULT_MAX_IMAGES; uint8_t norm_images_in_DPB = DPB_DEFAULT_MAX_IMAGES; std::vector dpb; // decoded picture buffer std::vector reorder_output_queue; std::deque image_output_queue; decoded_picture_buffer(const decoded_picture_buffer&) = delete; decoded_picture_buffer& operator=(const decoded_picture_buffer&) = delete; }; #endif libde265-1.0.18/libde265/fallback-dct.cc000066400000000000000000000753531515675107500173000ustar00rootroot00000000000000/* * H.265 video codec. 
* Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #include "fallback-dct.h" #if defined(_MSC_VER) || defined(__MINGW32__) # include #elif defined(HAVE_ALLOCA_H) # include #endif #include #include #if 0 static void printMatrix(const char* name, const int16_t* v, int n) { printf("--- %s ---\n",name); for (int r=0;r>bdShift2; dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + c); } } void transform_skip_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) { int nT = 4; int bdShift2 = 20-bit_depth; assert(0); // DEPRECATED, should not be used anymore because of fixed 4x4 size for (int y=0;y>bdShift2; dst[y*stride+x] = Clip_BitDepth(dst[y*stride+x] + c, bit_depth); } } void transform_skip_residual_fallback(int32_t *residual, const int16_t *coeffs, int nT, int tsShift,int bdShift) { const int rnd = 1<<(bdShift-1); for (int y=0;y> bdShift; } } void transform_skip_rdpcm_v_8_fallback(uint8_t *dst, const int16_t *coeffs, int log2nT, ptrdiff_t stride) { int bitDepth = 8; int bdShift2 = 20-bitDepth; int offset = (1<<(bdShift2-1)); int tsShift = 5 + log2nT; // TODO: extended_precision int nT = 1<>bdShift2; dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + sum); } } } void transform_skip_rdpcm_h_8_fallback(uint8_t *dst, const int16_t *coeffs, int log2nT, ptrdiff_t stride) { int bitDepth = 8; int bdShift2 = 
20-bitDepth; int offset = (1<<(bdShift2-1)); int tsShift = 5 + log2nT; // TODO: extended_precision int nT = 1<>bdShift2; dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + sum); } } } void transform_bypass_rdpcm_v_8_fallback(uint8_t *dst, const int16_t *coeffs,int nT,ptrdiff_t stride) { for (int x=0;x>bdShift; residual[y*nT+x] = sum; } } } void rdpcm_h_fallback(int32_t* residual, const int16_t* coeffs, int nT,int tsShift,int bdShift) { int rnd = (1<<(bdShift-1)); for (int y=0;y>bdShift; residual[y*nT+x] = sum; } } } void transform_bypass_fallback(int32_t *dst, const int16_t *coeffs, int nT) { for (int y=0;y "); */ for (int i=0;i<4;i++) { int sum=0; for (int j=0;j<4;j++) { sum += mat_8_357[j][i] * coeffs[c+j*4]; } g[i][c] = Clip3(-32768,32767, (sum+rndV)>>7); } /* for (int y=0;y<4;y++) { logtrace(LogTransform,"*%d ",g[y][c]); } logtrace(LogTransform,"*\n"); */ } // --- H --- for (int y=0;y<4;y++) { /* logtrace(LogTransform,"DST-H: "); for (int c=0;c<4;c++) { logtrace(LogTransform,"%d ",g[y][c]); } logtrace(LogTransform,"* -> "); */ for (int i=0;i<4;i++) { int sum=0; for (int j=0;j<4;j++) { sum += mat_8_357[j][i] * g[y][j]; } int out = Clip3(-32768,32767, (sum+rndH)>>postShift); dst[y*stride+i] = Clip1_8bit(dst[y*stride+i] + out); logtrace(LogTransform,"*%d ",out); } logtrace(LogTransform,"*\n"); } } void transform_4x4_luma_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) { int16_t g[4][4]; int postShift = 20-bit_depth; int rndV = 1<<(7-1); int rndH = 1<<(postShift-1); // --- V --- for (int c=0;c<4;c++) { /* logtrace(LogTransform,"DST-V: "); for (int r=0;r<4;r++) { logtrace(LogTransform,"%d ",coeffs[c+r*4]); } logtrace(LogTransform,"* -> "); */ for (int i=0;i<4;i++) { int sum=0; for (int j=0;j<4;j++) { sum += mat_8_357[j][i] * coeffs[c+j*4]; } g[i][c] = Clip3(-32768,32767, (sum+rndV)>>7); } /* for (int y=0;y<4;y++) { logtrace(LogTransform,"*%d ",g[y][c]); } logtrace(LogTransform,"*\n"); */ } // --- H --- for (int y=0;y<4;y++) { /* 
logtrace(LogTransform,"DST-H: "); for (int c=0;c<4;c++) { logtrace(LogTransform,"%d ",g[y][c]); } logtrace(LogTransform,"* -> "); */ for (int i=0;i<4;i++) { int sum=0; for (int j=0;j<4;j++) { sum += mat_8_357[j][i] * g[y][j]; } int out = Clip3(-32768,32767, (sum+rndH)>>postShift); dst[y*stride+i] = Clip_BitDepth(dst[y*stride+i] + out, bit_depth); logtrace(LogTransform,"*%d ",out); } logtrace(LogTransform,"*\n"); } } void fdst_4x4_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) { int16_t g[4*4]; int BD = 8; int shift1 = Log2(4) + BD -9; int shift2 = Log2(4) + 6; int rnd1 = 1<<(shift1-1); int rnd2 = 1<<(shift2-1); // --- V --- for (int c=0;c<4;c++) { /* logtrace(LogTransform,"DST-V: "); for (int r=0;r<4;r++) { logtrace(LogTransform,"%d ",coeffs[c+r*4]); } logtrace(LogTransform,"* -> "); */ for (int i=0;i<4;i++) { int sum=0; for (int j=0;j<4;j++) { sum += mat_8_357[i][j] * input[c+j*stride]; } g[c+4*i] = Clip3(-32768,32767, (sum+rnd1)>>shift1); } } // --- H --- for (int y=0;y<4;y++) { for (int i=0;i<4;i++) { int sum=0; for (int j=0;j<4;j++) { sum += mat_8_357[i][j] * g[y*4+j]; } // TODO: do we need clipping ? 
int out = (sum+rnd2)>>shift2; // Clip3(-32768,32767, (sum+rndH)>>postShift); coeffs[y*4+i] = out; logtrace(LogTransform,"*%d ",out); } logtrace(LogTransform,"*\n"); } } void transform_idst_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits) { int16_t g[4][4]; int rndV = 1<<(7-1); int rndH = 1<<(bdShift-1); int CoeffMax = (1<>7); } } // --- H --- for (int y=0;y<4;y++) { for (int i=0;i<4;i++) { int sum=0; for (int j=0;j<4;j++) { sum += mat_8_357[j][i] * g[y][j]; } dst[y*4+i] = (sum + rndH)>>bdShift; } } } static int8_t mat_dct[32][32] = { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}, { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -4,-13,-22,-31,-38,-46,-54,-61,-67,-73,-78,-82,-85,-88,-90,-90}, { 90, 87, 80, 70, 57, 43, 25, 9, -9,-25,-43,-57,-70,-80,-87,-90, -90,-87,-80,-70,-57,-43,-25, -9, 9, 25, 43, 57, 70, 80, 87, 90}, { 90, 82, 67, 46, 22, -4,-31,-54,-73,-85,-90,-88,-78,-61,-38,-13, 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4,-22,-46,-67,-82,-90}, { 89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89, 89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89}, { 88, 67, 31,-13,-54,-82,-90,-78,-46, -4, 38, 73, 90, 85, 61, 22, -22,-61,-85,-90,-73,-38, 4, 46, 78, 90, 82, 54, 13,-31,-67,-88}, { 87, 57, 9,-43,-80,-90,-70,-25, 25, 70, 90, 80, 43, -9,-57,-87, -87,-57, -9, 43, 80, 90, 70, 25,-25,-70,-90,-80,-43, 9, 57, 87}, { 85, 46,-13,-67,-90,-73,-22, 38, 82, 88, 54, -4,-61,-90,-78,-31, 31, 78, 90, 61, 4,-54,-88,-82,-38, 22, 73, 90, 67, 13,-46,-85}, { 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83}, { 82, 22,-54,-90,-61, 13, 78, 85, 31,-46,-90,-67, 4, 73, 88, 38, -38,-88,-73, -4, 67, 90, 46,-31,-85,-78,-13, 61, 90, 54,-22,-82}, { 80, 9,-70,-87,-25, 57, 90, 43,-43,-90,-57, 25, 87, 70, -9,-80, -80, -9, 70, 87, 25,-57,-90,-43, 43, 90, 
57,-25,-87,-70, 9, 80}, { 78, -4,-82,-73, 13, 85, 67,-22,-88,-61, 31, 90, 54,-38,-90,-46, 46, 90, 38,-54,-90,-31, 61, 88, 22,-67,-85,-13, 73, 82, 4,-78}, { 75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75, 75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75}, { 73,-31,-90,-22, 78, 67,-38,-90,-13, 82, 61,-46,-88, -4, 85, 54, -54,-85, 4, 88, 46,-61,-82, 13, 90, 38,-67,-78, 22, 90, 31,-73}, { 70,-43,-87, 9, 90, 25,-80,-57, 57, 80,-25,-90, -9, 87, 43,-70, -70, 43, 87, -9,-90,-25, 80, 57,-57,-80, 25, 90, 9,-87,-43, 70}, { 67,-54,-78, 38, 85,-22,-90, 4, 90, 13,-88,-31, 82, 46,-73,-61, 61, 73,-46,-82, 31, 88,-13,-90, -4, 90, 22,-85,-38, 78, 54,-67}, { 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64}, { 61,-73,-46, 82, 31,-88,-13, 90, -4,-90, 22, 85,-38,-78, 54, 67, -67,-54, 78, 38,-85,-22, 90, 4,-90, 13, 88,-31,-82, 46, 73,-61}, { 57,-80,-25, 90, -9,-87, 43, 70,-70,-43, 87, 9,-90, 25, 80,-57, -57, 80, 25,-90, 9, 87,-43,-70, 70, 43,-87, -9, 90,-25,-80, 57}, { 54,-85, -4, 88,-46,-61, 82, 13,-90, 38, 67,-78,-22, 90,-31,-73, 73, 31,-90, 22, 78,-67,-38, 90,-13,-82, 61, 46,-88, 4, 85,-54}, { 50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50, 50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50}, { 46,-90, 38, 54,-90, 31, 61,-88, 22, 67,-85, 13, 73,-82, 4, 78, -78, -4, 82,-73,-13, 85,-67,-22, 88,-61,-31, 90,-54,-38, 90,-46}, { 43,-90, 57, 25,-87, 70, 9,-80, 80, -9,-70, 87,-25,-57, 90,-43, -43, 90,-57,-25, 87,-70, -9, 80,-80, 9, 70,-87, 25, 57,-90, 43}, { 38,-88, 73, -4,-67, 90,-46,-31, 85,-78, 13, 61,-90, 54, 22,-82, 82,-22,-54, 90,-61,-13, 78,-85, 31, 46,-90, 67, 4,-73, 88,-38}, { 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36}, { 31,-78, 90,-61, 4, 54,-88, 82,-38,-22, 73,-90, 67,-13,-46, 85, -85, 46, 13,-67, 90,-73, 22, 38,-82, 88,-54, -4, 61,-90, 78,-31}, { 25,-70, 90,-80, 
43, 9,-57, 87,-87, 57, -9,-43, 80,-90, 70,-25, -25, 70,-90, 80,-43, -9, 57,-87, 87,-57, 9, 43,-80, 90,-70, 25}, { 22,-61, 85,-90, 73,-38, -4, 46,-78, 90,-82, 54,-13,-31, 67,-88, 88,-67, 31, 13,-54, 82,-90, 78,-46, 4, 38,-73, 90,-85, 61,-22}, { 18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18, 18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18}, { 13,-38, 61,-78, 88,-90, 85,-73, 54,-31, 4, 22,-46, 67,-82, 90, -90, 82,-67, 46,-22, -4, 31,-54, 73,-85, 90,-88, 78,-61, 38,-13}, { 9,-25, 43,-57, 70,-80, 87,-90, 90,-87, 80,-70, 57,-43, 25, -9, -9, 25,-43, 57,-70, 80,-87, 90,-90, 87,-80, 70,-57, 43,-25, 9}, { 4,-13, 22,-31, 38,-46, 54,-61, 67,-73, 78,-82, 85,-88, 90,-90, 90,-90, 88,-85, 82,-78, 73,-67, 61,-54, 46,-38, 31,-22, 13, -4} }; template void transform_idct_add(pixel_t *dst, ptrdiff_t stride, int nT, const int16_t *coeffs, int bit_depth) { /* The effective shift is 7 bits right for bit-depth 8, 6 bits right for bit-depth 9, 5 bits right for bit-depth 10. Computation is independent of the block size. Each multiplication with the table includes a left shift of 6 bits. Hence, we have 2* 6 bits = 12 bits left shift. V-pass has fixed 7 bit right shift. 
H-pass has 20-BitDepth bit right shift; Effective shift 's' means: residual value 1 gives DC-coeff (1< "); */ // find last non-zero coefficient to reduce computations carried out in DCT int lastCol = nT-1; for (;lastCol>=0;lastCol--) { if (coeffs[c+lastCol*nT]) { break; } } for (int i=0;i>7); logtrace(LogTransform,"*%d ",g[c+i*nT]); } logtrace(LogTransform,"*\n"); } /* printf("--- temp\n"); for (int r=0;r "); */ // find last non-zero coefficient to reduce computations carried out in DCT int lastCol = nT-1; for (;lastCol>=0;lastCol--) { if (g[y*nT+lastCol]) { break; } } for (int i=0;i>postShift); int out = (sum+rnd2)>>postShift; //fprintf(stderr,"%d*%d+%d = %d\n",y,stride,i,y*stride+i); //fprintf(stderr,"[%p]=%d\n",&dst[y*stride+i], Clip1_8bit(dst[y*stride+i])); dst[y*stride+i] = Clip_BitDepth(dst[y*stride+i] + out, bit_depth); logtrace(LogTransform,"*%d ",out); } logtrace(LogTransform,"*\n"); } } void transform_idct_fallback(int32_t *dst, int nT, const int16_t *coeffs, int bdShift, int max_coeff_bits) { /* The effective shift is 7 bits right for bit-depth 8, 6 bits right for bit-depth 9, 5 bits right for bit-depth 10. One transformation with raw transform filter values increases range be 2048 (=32*64). This equals 11 bits. Computation is independent of the block size. Each multiplication with the table includes a left shift of 6 bits. Hence, we have 2* 6 bits = 12 bits left shift. V-pass has fixed 7 bit right shift. 
H-pass has 20-BitDepth bit right shift; Effective shift 's' means: residual value 1 gives DC-coeff (1< "); */ // find last non-zero coefficient to reduce computations carried out in DCT int lastCol = nT-1; for (;lastCol>=0;lastCol--) { if (coeffs[c+lastCol*nT]) { break; } } for (int i=0;i>7); logtrace(LogTransform,"*%d ",g[c+i*nT]); } logtrace(LogTransform,"*\n"); } /* printf("--- temp\n"); for (int r=0;r "); */ // find last non-zero coefficient to reduce computations carried out in DCT int lastCol = nT-1; for (;lastCol>=0;lastCol--) { if (g[y*nT+lastCol]) { break; } } for (int i=0;i>bdShift; logtrace(LogTransform,"*%d ",sum); } logtrace(LogTransform,"*\n"); } } void transform_idct_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits) { transform_idct_fallback(dst,4,coeffs,bdShift,max_coeff_bits); } void transform_idct_8x8_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits) { transform_idct_fallback(dst,8,coeffs,bdShift,max_coeff_bits); } void transform_idct_16x16_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits) { transform_idct_fallback(dst,16,coeffs,bdShift,max_coeff_bits); } void transform_idct_32x32_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits) { transform_idct_fallback(dst,32,coeffs,bdShift,max_coeff_bits); } void transform_4x4_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride) { transform_idct_add(dst,stride, 4, coeffs, 8); } void transform_8x8_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride) { transform_idct_add(dst,stride, 8, coeffs, 8); } void transform_16x16_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride) { transform_idct_add(dst,stride, 16, coeffs, 8); } void transform_32x32_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride) { transform_idct_add(dst,stride, 32, coeffs, 8); } void transform_4x4_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t 
stride, int bit_depth) { transform_idct_add(dst,stride, 4, coeffs, bit_depth); } void transform_8x8_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) { transform_idct_add(dst,stride, 8, coeffs, bit_depth); } void transform_16x16_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) { transform_idct_add(dst,stride, 16, coeffs, bit_depth); } void transform_32x32_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) { transform_idct_add(dst,stride, 32, coeffs, bit_depth); } static void transform_fdct_8(int16_t* coeffs, int nT, const int16_t *input, ptrdiff_t stride) { /* Each sum over a basis vector sums nT elements, which is compensated by shifting right by Log2(nT), effectively dividing by 2^Log2(nT) = nT. Do this in each of the H/V passes. Each multiplication with the table includes a left shift of 6 bits. Hence, we have in total 2* 6 bits = 12 bits left shift because of the multiplications. We carry out shifts after each pass: First (V) pass has BitDepth-9 bit right shift, Second (H) pass has fixed 6 bit right shift. For bit-depth 8, the total shift is 7 bits left. For bit-depth 9, the total shift is 6 bits left. For bit-depth 10, the total shift is 5 bits left. I.e.: a constant residual value 1 gives DC-coeff (1< 4 bits and we are down to 16 bits again. After the second pass, we need 16+5+6=27 bits for the intermediate sum (16 bit input, 5 bit because we sum 2^5 elements, 6 bit because of coefficient multiplication). The second pass shift is Log2(32)+6 = 11 and we are down again to 16 bits. For larger input bit-depths, the intermediate result after the first pass will be wider accordingly, but the widths after the shifts are the same. 
*/ int BitDepth = 8; // / compensate everything | / effective word length | int shift1 = Log2(nT) + 6 + BitDepth - 15; int shift2 = Log2(nT) + 6; int rnd1 = 1<<(shift1-1); int rnd2 = 1<<(shift2-1); int fact = (1<<(5-Log2(nT))); int16_t g[32*32]; // actually, only [nT*nT] used for (int c=0;c>shift1; // clipping to -32768;32767 unnecessary } } for (int y=0;y>shift2; coeffs[y*nT+i] = out; } } } void fdct_4x4_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) { transform_fdct_8(coeffs, 4, input,stride); } void fdct_8x8_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) { transform_fdct_8(coeffs, 8, input,stride); } void fdct_16x16_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) { transform_fdct_8(coeffs, 16, input,stride); } void fdct_32x32_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) { transform_fdct_8(coeffs, 32, input,stride); } void hadamard_transform_8(int16_t *coeffs, int n, const int16_t *input, ptrdiff_t stride) { int16_t tmp[32*32]; // row transforms //printMatrix("input",input,n); int16_t am[32],bm[32]; int16_t *a = am, *b = bm; for (int row=0;row>1);i++) { a[ i] = input[i+rs] + input[i+(n>>1)+rs]; a[(n>>1)+i] = input[i+rs] - input[i+(n>>1)+rs]; } int iOuter=(n>>1); int nInner=(n>>2); while (nInner>=2) { std::swap(a,b); for (int k=0;k>=1; nInner>>=1; } for (int k=0;k>1);i++) { a[ i] = tmp[i*n+col] + tmp[(i+(n>>1))*n+col]; a[(n>>1)+i] = tmp[i*n+col] - tmp[(i+(n>>1))*n+col]; } int iOuter=(n>>1); int nInner=(n>>2); while (nInner>=2) { std::swap(a,b); for (int k=0;k>=1; nInner>>=1; } for (int k=0;k * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. 
* * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef FALLBACK_DCT_H #define FALLBACK_DCT_H #include #include #include "util.h" // --- decoding --- void transform_skip_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); void transform_bypass_fallback(int32_t *r, const int16_t *coeffs, int nT); void transform_skip_rdpcm_v_8_fallback(uint8_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride); void transform_skip_rdpcm_h_8_fallback(uint8_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride); void transform_bypass_rdpcm_v_fallback(int32_t *r, const int16_t *coeffs,int nT); void transform_bypass_rdpcm_h_fallback(int32_t *r, const int16_t *coeffs,int nT); void transform_4x4_luma_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); void transform_4x4_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); void transform_8x8_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); void transform_16x16_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); void transform_32x32_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); void transform_skip_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); void transform_bypass_16_fallback(uint16_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth); void transform_4x4_luma_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); void transform_4x4_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); void transform_8x8_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); 
void transform_16x16_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); void transform_32x32_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); void rotate_coefficients_fallback(int16_t *coeff, int nT); void transform_idst_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); void transform_idct_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); void transform_idct_8x8_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); void transform_idct_16x16_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); void transform_idct_32x32_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); template void add_residual_fallback(pixel_t *dst, ptrdiff_t stride, const int32_t* r, int nT, int bit_depth) { for (int y=0;y * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #include "fallback-motion.h" #include "util.h" #if defined(_MSC_VER) || defined(__MINGW32__) # include #elif defined(HAVE_ALLOCA_H) # include #endif #include void put_unweighted_pred_8_fallback(uint8_t *dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height) { int offset8bit = 32; int shift8bit = 6; assert((width&1)==0); for (int y=0;y>shift8bit); out[1] = Clip1_8bit((in[1] + offset8bit)>>shift8bit); out+=2; in+=2; } } } void put_weighted_pred_8_fallback(uint8_t *dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height, int w,int o,int log2WD) { assert(log2WD>=1); // TODO const int rnd = (1<<(log2WD-1)); for (int y=0;y>log2WD) + o); out++; in++; } } } void put_weighted_bipred_8_fallback(uint8_t *dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height, int w1,int o1, int w2,int o2, int log2WD) { assert(log2WD>=1); // TODO const int rnd = static_cast(static_cast(o1+o2+1) << log2WD); for (int y=0;y>(log2WD+1)); out++; in1++; in2++; } } } void put_weighted_pred_avg_8_fallback(uint8_t *dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height) { int offset8bit = 64; int shift8bit = 7; assert((width&1)==0); // I had a special case for 8-pixel parallel, unrolled code, // but I did not see any speedup. 
#if 0 for (int y=0;y>shift8bit); out++; in1++; in2++; } } #endif #if 0 if ((width&7)==0) { for (int y=0;y>shift8bit); out[1] = Clip1_8bit((in1[1] + in2[1] + offset8bit)>>shift8bit); out[2] = Clip1_8bit((in1[2] + in2[2] + offset8bit)>>shift8bit); out[3] = Clip1_8bit((in1[3] + in2[3] + offset8bit)>>shift8bit); out[4] = Clip1_8bit((in1[4] + in2[4] + offset8bit)>>shift8bit); out[5] = Clip1_8bit((in1[5] + in2[5] + offset8bit)>>shift8bit); out[6] = Clip1_8bit((in1[6] + in2[6] + offset8bit)>>shift8bit); out[7] = Clip1_8bit((in1[7] + in2[7] + offset8bit)>>shift8bit); out+=8; in1+=8; in2+=8; } } } else #endif { for (int y=0;y>shift8bit); out[1] = Clip1_8bit((in1[1] + in2[1] + offset8bit)>>shift8bit); out+=2; in1+=2; in2+=2; } } } } void put_unweighted_pred_16_fallback(uint16_t *dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height, int bit_depth) { int shift1 = 14-bit_depth; int offset1 = 0; if (shift1>0) { offset1 = 1<<(shift1-1); } assert((width&1)==0); for (int y=0;y>shift1, bit_depth); out[1] = Clip_BitDepth((in[1] + offset1)>>shift1, bit_depth); out+=2; in+=2; } } } #include void put_weighted_pred_16_fallback(uint16_t *dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height, int w,int o,int log2WD, int bit_depth) { assert(log2WD>=1); // TODO const int rnd = (1<<(log2WD-1)); for (int y=0;y>log2WD) + o, bit_depth); out++; in++; } } } void put_weighted_bipred_16_fallback(uint16_t *dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height, int w1,int o1, int w2,int o2, int log2WD, int bit_depth) { assert(log2WD>=1); // TODO const int rnd = static_cast(static_cast(o1+o2+1) << log2WD); for (int y=0;y>(log2WD+1), bit_depth); out++; in1++; in2++; } } } void put_weighted_pred_avg_16_fallback(uint16_t *dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height, int bit_depth) { int shift2 = 15-bit_depth; int 
offset2 = 1<<(shift2-1); assert((width&1)==0); for (int y=0;y>shift2, bit_depth); out[1] = Clip_BitDepth((in1[1] + in2[1] + offset2)>>shift2, bit_depth); out+=2; in1+=2; in2+=2; } } } void put_epel_8_fallback(int16_t *out, ptrdiff_t out_stride, const uint8_t *src, ptrdiff_t src_stride, int width, int height, int mx, int my, int16_t* mcbuffer) { int shift3 = 6; for (int y=0;y void put_epel_hv_fallback(int16_t *dst, ptrdiff_t dst_stride, const pixel_t *src, ptrdiff_t src_stride, int nPbWC, int nPbHC, int xFracC, int yFracC, int16_t* mcbuffer, int bit_depth) { const int shift1 = bit_depth-8; const int shift2 = 6; //const int shift3 = 6; int extra_left = 1; int extra_top = 1; // int extra_right = 2; int extra_bottom= 2; int nPbH_extra = extra_top + nPbHC + extra_bottom; int16_t* tmp2buf = (int16_t*)alloca( nPbWC * nPbH_extra * sizeof(int16_t) ); /* int nPbW_extra = extra_left + nPbWC + extra_right; printf("x,y FracC: %d/%d\n",xFracC,yFracC); printf("---IN---\n"); for (int y=-extra_top;y>shift1; break; case 2: v = (-4*p[0]+54*p[1]+16*p[2]-2*p[3])>>shift1; break; case 3: v = (-6*p[0]+46*p[1]+28*p[2]-4*p[3])>>shift1; break; case 4: v = (-4*p[0]+36*p[1]+36*p[2]-4*p[3])>>shift1; break; case 5: v = (-4*p[0]+28*p[1]+46*p[2]-6*p[3])>>shift1; break; case 6: v = (-2*p[0]+16*p[1]+54*p[2]-4*p[3])>>shift1; break; default: case 7: v = (-2*p[0]+10*p[1]+58*p[2]-2*p[3])>>shift1; break; } //printf("%d %d %d %d -> %d\n",p[0],p[1],p[2],p[3],v); tmp2buf[y+extra_top + x*nPbH_extra] = v; p++; //printf("%05d ",tmp2buf[y+extra_top + x*nPbH_extra]); } //printf("\n"); } // V-filters int vshift = (xFracC==0 ? 
shift1 : shift2); for (int x=0;x>vshift; break; case 2: v = (-4*p[0]+54*p[1]+16*p[2]-2*p[3])>>vshift; break; case 3: v = (-6*p[0]+46*p[1]+28*p[2]-4*p[3])>>vshift; break; case 4: v = (-4*p[0]+36*p[1]+36*p[2]-4*p[3])>>vshift; break; case 5: v = (-4*p[0]+28*p[1]+46*p[2]-6*p[3])>>vshift; break; case 6: v = (-2*p[0]+16*p[1]+54*p[2]-4*p[3])>>vshift; break; default: case 7: v = (-2*p[0]+10*p[1]+58*p[2]-2*p[3])>>vshift; break; } dst[x + y*dst_stride] = v; p++; } } /* printf("---V---\n"); for (int y=0;y(int16_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int nPbWC, int nPbHC, int xFracC, int yFracC, int16_t* mcbuffer, int bit_depth); template void put_epel_hv_fallback(int16_t *dst, ptrdiff_t dst_stride, const uint16_t *src, ptrdiff_t src_stride, int nPbWC, int nPbHC, int xFracC, int yFracC, int16_t* mcbuffer, int bit_depth); void put_qpel_0_0_fallback(int16_t *out, ptrdiff_t out_stride, const uint8_t *src, ptrdiff_t srcstride, int nPbW, int nPbH, int16_t* mcbuffer) { //const int shift1 = 0; // sps->BitDepth_Y-8; const int shift2 = 6; // straight copy for (int y=0;y void put_qpel_fallback(int16_t *out, ptrdiff_t out_stride, const pixel_t *src, ptrdiff_t srcstride, int nPbW, int nPbH, int16_t* mcbuffer, int xFracL, int yFracL, int bit_depth) { int extra_left = extra_before[xFracL]; //int extra_right = extra_after [xFracL]; int extra_top = extra_before[yFracL]; int extra_bottom = extra_after [yFracL]; //int nPbW_extra = extra_left + nPbW + extra_right; int nPbH_extra = extra_top + nPbH + extra_bottom; const int shift1 = bit_depth-8; const int shift2 = 6; // H-filters switch (xFracL) { case 0: for (int y=-extra_top;y>shift1; o += nPbH_extra; p++; } } break; case 2: for (int y=-extra_top;y>shift1; o += nPbH_extra; p++; } } break; case 3: for (int y=-extra_top;y>shift1; o += nPbH_extra; p++; } } break; } logtrace(LogMotion,"---H---\n"); for (int y=-extra_top;y>vshift; o+=out_stride; p++; } } break; case 2: for (int x=0;x>vshift; o+=out_stride; p++; } } 
break; case 3: for (int x=0;x>vshift; o+=out_stride; p++; } } break; } logtrace(LogMotion,"---V---\n"); for (int y=0;y * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef FALLBACK_MOTION_H #define FALLBACK_MOTION_H #include #include void put_weighted_pred_avg_8_fallback(uint8_t *dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height); void put_unweighted_pred_8_fallback(uint8_t *_dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height); void put_weighted_pred_8_fallback(uint8_t *_dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height, int w,int o,int log2WD); void put_weighted_bipred_8_fallback(uint8_t *_dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height, int w1,int o1, int w2,int o2, int log2WD); void put_weighted_pred_avg_16_fallback(uint16_t *dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height, int bit_depth); void put_unweighted_pred_16_fallback(uint16_t *_dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height, int bit_depth); void put_weighted_pred_16_fallback(uint16_t *_dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height, int w,int o,int log2WD, int 
bit_depth); void put_weighted_bipred_16_fallback(uint16_t *_dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height, int w1,int o1, int w2,int o2, int log2WD, int bit_depth); void put_epel_8_fallback(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer); void put_epel_16_fallback(int16_t *out, ptrdiff_t out_stride, const uint16_t *src, ptrdiff_t src_stride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth); template void put_epel_hv_fallback(int16_t *dst, ptrdiff_t dststride, const pixel_t *_src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth); #define QPEL(x,y) void put_qpel_ ## x ## _ ## y ## _fallback(int16_t *out, ptrdiff_t out_stride, \ const uint8_t *src, ptrdiff_t srcstride, \ int nPbW, int nPbH, int16_t* mcbuffer) QPEL(0,0); QPEL(0,1); QPEL(0,2); QPEL(0,3); QPEL(1,0); QPEL(1,1); QPEL(1,2); QPEL(1,3); QPEL(2,0); QPEL(2,1); QPEL(2,2); QPEL(2,3); QPEL(3,0); QPEL(3,1); QPEL(3,2); QPEL(3,3); #undef QPEL #define QPEL(x,y) void put_qpel_ ## x ## _ ## y ## _fallback_16(int16_t *out, ptrdiff_t out_stride, \ const uint16_t *src, ptrdiff_t srcstride, \ int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) QPEL(0,0); QPEL(0,1); QPEL(0,2); QPEL(0,3); QPEL(1,0); QPEL(1,1); QPEL(1,2); QPEL(1,3); QPEL(2,0); QPEL(2,1); QPEL(2,2); QPEL(2,3); QPEL(3,0); QPEL(3,1); QPEL(3,2); QPEL(3,3); #undef QPEL #endif libde265-1.0.18/libde265/fallback.cc000066400000000000000000000136621515675107500165230ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. 
* * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #include "fallback.h" #include "fallback-motion.h" #include "fallback-dct.h" void init_acceleration_functions_fallback(struct acceleration_functions* accel) { accel->put_weighted_pred_avg_8 = put_weighted_pred_avg_8_fallback; accel->put_unweighted_pred_8 = put_unweighted_pred_8_fallback; accel->put_weighted_pred_8 = put_weighted_pred_8_fallback; accel->put_weighted_bipred_8 = put_weighted_bipred_8_fallback; accel->put_weighted_pred_avg_16 = put_weighted_pred_avg_16_fallback; accel->put_unweighted_pred_16 = put_unweighted_pred_16_fallback; accel->put_weighted_pred_16 = put_weighted_pred_16_fallback; accel->put_weighted_bipred_16 = put_weighted_bipred_16_fallback; accel->put_hevc_epel_8 = put_epel_8_fallback; accel->put_hevc_epel_h_8 = put_epel_hv_fallback; accel->put_hevc_epel_v_8 = put_epel_hv_fallback; accel->put_hevc_epel_hv_8 = put_epel_hv_fallback; accel->put_hevc_qpel_8[0][0] = put_qpel_0_0_fallback; accel->put_hevc_qpel_8[0][1] = put_qpel_0_1_fallback; accel->put_hevc_qpel_8[0][2] = put_qpel_0_2_fallback; accel->put_hevc_qpel_8[0][3] = put_qpel_0_3_fallback; accel->put_hevc_qpel_8[1][0] = put_qpel_1_0_fallback; accel->put_hevc_qpel_8[1][1] = put_qpel_1_1_fallback; accel->put_hevc_qpel_8[1][2] = put_qpel_1_2_fallback; accel->put_hevc_qpel_8[1][3] = put_qpel_1_3_fallback; accel->put_hevc_qpel_8[2][0] = put_qpel_2_0_fallback; accel->put_hevc_qpel_8[2][1] = put_qpel_2_1_fallback; accel->put_hevc_qpel_8[2][2] = put_qpel_2_2_fallback; accel->put_hevc_qpel_8[2][3] = put_qpel_2_3_fallback; accel->put_hevc_qpel_8[3][0] = put_qpel_3_0_fallback; accel->put_hevc_qpel_8[3][1] = put_qpel_3_1_fallback; 
accel->put_hevc_qpel_8[3][2] = put_qpel_3_2_fallback; accel->put_hevc_qpel_8[3][3] = put_qpel_3_3_fallback; accel->put_hevc_epel_16 = put_epel_16_fallback; accel->put_hevc_epel_h_16 = put_epel_hv_fallback; accel->put_hevc_epel_v_16 = put_epel_hv_fallback; accel->put_hevc_epel_hv_16 = put_epel_hv_fallback; accel->put_hevc_qpel_16[0][0] = put_qpel_0_0_fallback_16; accel->put_hevc_qpel_16[0][1] = put_qpel_0_1_fallback_16; accel->put_hevc_qpel_16[0][2] = put_qpel_0_2_fallback_16; accel->put_hevc_qpel_16[0][3] = put_qpel_0_3_fallback_16; accel->put_hevc_qpel_16[1][0] = put_qpel_1_0_fallback_16; accel->put_hevc_qpel_16[1][1] = put_qpel_1_1_fallback_16; accel->put_hevc_qpel_16[1][2] = put_qpel_1_2_fallback_16; accel->put_hevc_qpel_16[1][3] = put_qpel_1_3_fallback_16; accel->put_hevc_qpel_16[2][0] = put_qpel_2_0_fallback_16; accel->put_hevc_qpel_16[2][1] = put_qpel_2_1_fallback_16; accel->put_hevc_qpel_16[2][2] = put_qpel_2_2_fallback_16; accel->put_hevc_qpel_16[2][3] = put_qpel_2_3_fallback_16; accel->put_hevc_qpel_16[3][0] = put_qpel_3_0_fallback_16; accel->put_hevc_qpel_16[3][1] = put_qpel_3_1_fallback_16; accel->put_hevc_qpel_16[3][2] = put_qpel_3_2_fallback_16; accel->put_hevc_qpel_16[3][3] = put_qpel_3_3_fallback_16; accel->transform_skip_8 = transform_skip_8_fallback; accel->transform_skip_rdpcm_h_8 = transform_skip_rdpcm_h_8_fallback; accel->transform_skip_rdpcm_v_8 = transform_skip_rdpcm_v_8_fallback; accel->transform_bypass = transform_bypass_fallback; accel->transform_bypass_rdpcm_h = transform_bypass_rdpcm_h_fallback; accel->transform_bypass_rdpcm_v = transform_bypass_rdpcm_v_fallback; accel->transform_4x4_dst_add_8 = transform_4x4_luma_add_8_fallback; accel->transform_add_8[0] = transform_4x4_add_8_fallback; accel->transform_add_8[1] = transform_8x8_add_8_fallback; accel->transform_add_8[2] = transform_16x16_add_8_fallback; accel->transform_add_8[3] = transform_32x32_add_8_fallback; accel->transform_skip_16 = transform_skip_16_fallback; 
accel->transform_4x4_dst_add_16 = transform_4x4_luma_add_16_fallback; accel->transform_add_16[0] = transform_4x4_add_16_fallback; accel->transform_add_16[1] = transform_8x8_add_16_fallback; accel->transform_add_16[2] = transform_16x16_add_16_fallback; accel->transform_add_16[3] = transform_32x32_add_16_fallback; accel->rotate_coefficients = rotate_coefficients_fallback; accel->add_residual_8 = add_residual_fallback; accel->add_residual_16 = add_residual_fallback; accel->rdpcm_h = rdpcm_h_fallback; accel->rdpcm_v = rdpcm_v_fallback; accel->transform_skip_residual = transform_skip_residual_fallback; accel->transform_idst_4x4 = transform_idst_4x4_fallback; accel->transform_idct_4x4 = transform_idct_4x4_fallback; accel->transform_idct_8x8 = transform_idct_8x8_fallback; accel->transform_idct_16x16 = transform_idct_16x16_fallback; accel->transform_idct_32x32 = transform_idct_32x32_fallback; accel->fwd_transform_4x4_dst_8 = fdst_4x4_8_fallback; accel->fwd_transform_8[0] = fdct_4x4_8_fallback; accel->fwd_transform_8[1] = fdct_8x8_8_fallback; accel->fwd_transform_8[2] = fdct_16x16_8_fallback; accel->fwd_transform_8[3] = fdct_32x32_8_fallback; accel->hadamard_transform_8[0] = hadamard_4x4_8_fallback; accel->hadamard_transform_8[1] = hadamard_8x8_8_fallback; accel->hadamard_transform_8[2] = hadamard_16x16_8_fallback; accel->hadamard_transform_8[3] = hadamard_32x32_8_fallback; } libde265-1.0.18/libde265/fallback.h000066400000000000000000000017061515675107500163610ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. 
* * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef DE265_FALLBACK_H #define DE265_FALLBACK_H #include "acceleration.h" void init_acceleration_functions_fallback(struct acceleration_functions* lowlevel); #endif libde265-1.0.18/libde265/image-io.cc000066400000000000000000000105521515675107500164460ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * Authors: struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #include "libde265/image-io.h" #include ImageSource::ImageSource() { } ImageSource::~ImageSource() { } ImageSource_YUV::ImageSource_YUV() : mFH(nullptr) { } ImageSource_YUV::~ImageSource_YUV() { if (mFH) { fclose(mFH); } } bool ImageSource_YUV::set_input_file(const char* filename, int w,int h) { assert(mFH==nullptr); mFH = fopen(filename,"rb"); if (mFH==nullptr) { return false; } width =w; height=h; mReachedEndOfFile = false; return true; } de265_image* ImageSource_YUV::read_next_image() { if (mReachedEndOfFile) return nullptr; de265_image* img = new de265_image; img->alloc_image(width,height,de265_chroma_420, nullptr, false, nullptr, /*nullptr,*/ 0, nullptr, false); assert(img); // TODO: error handling // --- load image --- uint8_t* p; int stride; p = img->get_image_plane(0); stride = img->get_image_stride(0); for (uint32_t y=0;yget_image_plane(1); stride = img->get_image_stride(1); for (uint32_t y=0;yget_image_plane(2); stride = img->get_image_stride(2); for (uint32_t y=0;yget_width(); int height= img->get_height(); p = img->get_image_plane(0); stride = img->get_image_stride(0); for (int y=0;yget_image_plane(1); stride = img->get_image_stride(1); for (int y=0;yget_image_plane(2); stride = img->get_image_stride(2); for (int y=0;y * * Authors: struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #ifndef IMAGE_IO_H #define IMAGE_IO_H #include "libde265/image.h" #include class ImageSource { public: LIBDE265_API ImageSource(); virtual LIBDE265_API ~ImageSource(); //enum ImageStatus { Available, Waiting, EndOfVideo }; //virtual ImageStatus get_status() = 0; virtual LIBDE265_API de265_image* get_image(bool block=true) = 0; virtual LIBDE265_API void skip_frames(int n) = 0; virtual LIBDE265_API int get_width() const = 0; virtual LIBDE265_API int get_height() const = 0; }; class ImageSource_YUV : public ImageSource { public: LIBDE265_API ImageSource_YUV(); virtual LIBDE265_API ~ImageSource_YUV(); bool LIBDE265_API set_input_file(const char* filename, int w,int h); //virtual ImageStatus get_status(); virtual LIBDE265_API de265_image* get_image(bool block=true); virtual LIBDE265_API void skip_frames(int n); virtual LIBDE265_API int get_width() const; virtual LIBDE265_API int get_height() const; private: FILE* mFH; bool mReachedEndOfFile; uint32_t width,height; de265_image* read_next_image(); }; class ImageSink { public: virtual LIBDE265_API ~ImageSink(); virtual LIBDE265_API void send_image(const de265_image* img) = 0; }; class ImageSink_YUV : public ImageSink { public: LIBDE265_API ImageSink_YUV(); LIBDE265_API ~ImageSink_YUV(); bool LIBDE265_API set_filename(const char* filename); virtual LIBDE265_API void send_image(const de265_image* img); private: FILE* mFH; }; class PacketSink { public: virtual LIBDE265_API ~PacketSink(); virtual LIBDE265_API void send_packet(const uint8_t* data, int n) = 0; }; class PacketSink_File : public PacketSink { public: LIBDE265_API PacketSink_File(); virtual LIBDE265_API ~PacketSink_File(); LIBDE265_API void set_filename(const char* filename); virtual LIBDE265_API void send_packet(const uint8_t* data, int n); private: FILE* mFH; }; #endif libde265-1.0.18/libde265/image.cc000066400000000000000000000542601515675107500160450ustar00rootroot00000000000000/* * H.265 video codec. 
* Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #include "image.h" #include "decctx.h" #include #include #include #include #include #ifdef HAVE_MALLOC_H #include #endif #ifdef HAVE_SSE4_1 // SSE code processes 128bit per iteration and thus might read more data // than is later actually used. #define MEMORY_PADDING 16 #else #define MEMORY_PADDING 0 #endif #define STANDARD_ALIGNMENT 16 #ifdef HAVE___MINGW_ALIGNED_MALLOC #define ALLOC_ALIGNED(alignment, size) __mingw_aligned_malloc((size), (alignment)) #define FREE_ALIGNED(mem) __mingw_aligned_free((mem)) #elif _WIN32 #define ALLOC_ALIGNED(alignment, size) _aligned_malloc((size), (alignment)) #define FREE_ALIGNED(mem) _aligned_free((mem)) #elif defined(HAVE_POSIX_MEMALIGN) static inline void *ALLOC_ALIGNED(size_t alignment, size_t size) { void *mem = nullptr; if (posix_memalign(&mem, alignment, size) != 0) { return nullptr; } return mem; }; #define FREE_ALIGNED(mem) free((mem)) #else #define ALLOC_ALIGNED(alignment, size) memalign((alignment), (size)) #define FREE_ALIGNED(mem) free((mem)) #endif #define ALLOC_ALIGNED_16(size) ALLOC_ALIGNED(16, size) LIBDE265_API void* de265_alloc_image_plane(struct de265_image* img, int cIdx, void* inputdata, int inputstride, void *userdata) { int alignment = STANDARD_ALIGNMENT; int stride = (img->get_width(cIdx) + alignment-1) / alignment * 
alignment; int height = img->get_height(cIdx); uint8_t* p = static_cast(ALLOC_ALIGNED_16(stride * height + MEMORY_PADDING)); if (p==nullptr) { return nullptr; } img->set_image_plane(cIdx, p, stride, userdata); // copy input data if provided if (inputdata != nullptr) { if (inputstride == stride) { memcpy(p, inputdata, stride*height); } else { for (int y=0;y(inputdata) + inputstride*y, inputstride); } } } return p; } LIBDE265_API void de265_free_image_plane(struct de265_image* img, int cIdx) { uint8_t* p = img->get_image_plane(cIdx); assert(p); FREE_ALIGNED(p); } static int de265_image_get_buffer(de265_decoder_context* ctx, de265_image_spec* spec, de265_image* img, void* userdata) { const int rawChromaWidth = spec->width / img->SubWidthC; const int rawChromaHeight = spec->height / img->SubHeightC; int luma_stride = (spec->width + spec->alignment-1) / spec->alignment * spec->alignment; int chroma_stride = (rawChromaWidth + spec->alignment-1) / spec->alignment * spec->alignment; assert(img->BitDepth_Y >= 8 && img->BitDepth_Y <= 16); assert(img->BitDepth_C >= 8 && img->BitDepth_C <= 16); int luma_bpl = luma_stride * ((img->BitDepth_Y+7)/8); int chroma_bpl = chroma_stride * ((img->BitDepth_C+7)/8); int luma_height = spec->height; int chroma_height = rawChromaHeight; bool alloc_failed = false; uint8_t* p[3] = { 0,0,0 }; p[0] = static_cast(ALLOC_ALIGNED_16(luma_height * luma_bpl + MEMORY_PADDING)); if (p[0]==nullptr) { alloc_failed=true; } if (img->get_chroma_format() != de265_chroma_mono) { p[1] = static_cast(ALLOC_ALIGNED_16(chroma_height * chroma_bpl + MEMORY_PADDING)); p[2] = static_cast(ALLOC_ALIGNED_16(chroma_height * chroma_bpl + MEMORY_PADDING)); if (p[1]==nullptr || p[2]==nullptr) { alloc_failed=true; } } else { p[1] = nullptr; p[2] = nullptr; chroma_stride = 0; } if (alloc_failed) { for (int i=0;i<3;i++) if (p[i]) { FREE_ALIGNED(p[i]); } return 0; } img->set_image_plane(0, p[0], luma_stride, nullptr); img->set_image_plane(1, p[1], chroma_stride, nullptr); 
img->set_image_plane(2, p[2], chroma_stride, nullptr); img->fill_image(0,0,0); return 1; } static void de265_image_release_buffer(de265_decoder_context* ctx, de265_image* img, void* userdata) { for (int i=0;i<3;i++) { uint8_t* p = img->get_image_plane(i); if (p) { FREE_ALIGNED(p); } } } de265_image_allocation de265_image::default_image_allocation = { de265_image_get_buffer, de265_image_release_buffer }; void de265_image::set_image_plane(int cIdx, uint8_t* mem, int stride, void *userdata) { pixels[cIdx] = mem; plane_user_data[cIdx] = userdata; if (cIdx==0) { this->stride = stride; } else { this->chroma_stride = stride; } } de265_image::de265_image() = default; de265_error de265_image::alloc_image(int w,int h, enum de265_chroma c, std::shared_ptr sps, bool allocMetadata, decoder_context* dctx, //encoder_context* ectx, de265_PTS pts, void* user_data, bool useCustomAllocFunc) { //if (allocMetadata) { assert(sps); } if (allocMetadata) { assert(sps); } if (sps) { this->sps = sps; } release(); /* TODO: review code for efficient allocation when arrays are already allocated to the requested size. Without the release, the old image-data will not be freed. 
*/ static std::atomic s_next_image_ID(0); ID = s_next_image_ID++; removed_at_picture_id = std::numeric_limits::max(); decctx = dctx; //encctx = ectx; // --- allocate image buffer --- chroma_format= c; width = w; height = h; chroma_width = w; chroma_height= h; this->user_data = user_data; this->pts = pts; de265_image_spec spec; uint8_t WinUnitX, WinUnitY; switch (chroma_format) { case de265_chroma_mono: WinUnitX=1; WinUnitY=1; break; case de265_chroma_420: WinUnitX=2; WinUnitY=2; break; case de265_chroma_422: WinUnitX=2; WinUnitY=1; break; case de265_chroma_444: WinUnitX=1; WinUnitY=1; break; default: assert(0); WinUnitX = WinUnitY = 0; } switch (chroma_format) { case de265_chroma_420: spec.format = de265_image_format_YUV420P8; chroma_width = (chroma_width +1)/2; chroma_height = (chroma_height+1)/2; SubWidthC = 2; SubHeightC = 2; break; case de265_chroma_422: spec.format = de265_image_format_YUV422P8; chroma_width = (chroma_width+1)/2; SubWidthC = 2; SubHeightC = 1; break; case de265_chroma_444: spec.format = de265_image_format_YUV444P8; SubWidthC = 1; SubHeightC = 1; break; case de265_chroma_mono: spec.format = de265_image_format_mono8; chroma_width = 0; chroma_height= 0; SubWidthC = 1; SubHeightC = 1; break; default: assert(false); break; } if (chroma_format != de265_chroma_mono && sps) { assert(sps->SubWidthC == SubWidthC); assert(sps->SubHeightC == SubHeightC); } spec.width = w; spec.height = h; spec.alignment = STANDARD_ALIGNMENT; // conformance window cropping int left = sps ? sps->conf_win_left_offset : 0; int right = sps ? sps->conf_win_right_offset : 0; int top = sps ? sps->conf_win_top_offset : 0; int bottom = sps ? 
sps->conf_win_bottom_offset : 0; if ((left+right)*WinUnitX >= width) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } if ((top+bottom)*WinUnitY >= height) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } width_confwin = width - (left+right)*WinUnitX; height_confwin= height- (top+bottom)*WinUnitY; chroma_width_confwin = chroma_width -left-right; chroma_height_confwin= chroma_height-top-bottom; spec.crop_left = left *WinUnitX; spec.crop_right = right*WinUnitX; spec.crop_top = top *WinUnitY; spec.crop_bottom= bottom*WinUnitY; spec.visible_width = width_confwin; spec.visible_height= height_confwin; BitDepth_Y = (sps==nullptr) ? 8 : sps->BitDepth_Y; BitDepth_C = (sps==nullptr) ? 8 : sps->BitDepth_C; bpp_shift[0] = (BitDepth_Y <= 8) ? 0 : 1; bpp_shift[1] = (BitDepth_C <= 8) ? 0 : 1; bpp_shift[2] = bpp_shift[1]; // allocate memory and set conformance window pointers void* alloc_userdata = nullptr; if (decctx) alloc_userdata = decctx->param_image_allocation_userdata; // if (encctx) alloc_userdata = encctx->param_image_allocation_userdata; // actually not needed /* if (encctx && useCustomAllocFunc) { encoder_image_release_func = encctx->release_func; // if we do not provide a release function, use our own if (encoder_image_release_func == nullptr) { image_allocation_functions = de265_image::default_image_allocation; } else { image_allocation_functions.get_buffer = nullptr; image_allocation_functions.release_buffer = nullptr; } } else*/ if (decctx && useCustomAllocFunc) { image_allocation_functions = decctx->param_image_allocation_functions; } else { image_allocation_functions = de265_image::default_image_allocation; } bool mem_alloc_success = true; if (image_allocation_functions.get_buffer != nullptr) { mem_alloc_success = image_allocation_functions.get_buffer(decctx, &spec, this, alloc_userdata); pixels_confwin[0] = pixels[0] + left*WinUnitX + top*WinUnitY*stride; if (chroma_format != de265_chroma_mono) { pixels_confwin[1] = pixels[1] + left + top*chroma_stride; 
pixels_confwin[2] = pixels[2] + left + top*chroma_stride; } else { pixels_confwin[1] = nullptr; pixels_confwin[2] = nullptr; } // check for memory shortage if (!mem_alloc_success) { return DE265_ERROR_OUT_OF_MEMORY; } } //alloc_functions = *allocfunc; //alloc_userdata = userdata; // --- allocate decoding info arrays --- if (allocMetadata) { // intra pred mode mem_alloc_success &= intraPredMode.alloc(sps->PicWidthInMinPUs, sps->PicHeightInMinPUs, sps->Log2MinPUSize); mem_alloc_success &= intraPredModeC.alloc(sps->PicWidthInMinPUs, sps->PicHeightInMinPUs, sps->Log2MinPUSize); // cb info mem_alloc_success &= cb_info.alloc(sps->PicWidthInMinCbsY, sps->PicHeightInMinCbsY, sps->Log2MinCbSizeY); // pb info int puWidth = sps->PicWidthInMinCbsY << (sps->Log2MinCbSizeY -2); int puHeight = sps->PicHeightInMinCbsY << (sps->Log2MinCbSizeY -2); mem_alloc_success &= pb_info.alloc(puWidth,puHeight, 2); // tu info mem_alloc_success &= tu_info.alloc(sps->PicWidthInTbsY, sps->PicHeightInTbsY, sps->Log2MinTrafoSize); // deblk info int deblk_w = (sps->pic_width_in_luma_samples +3)/4; int deblk_h = (sps->pic_height_in_luma_samples+3)/4; mem_alloc_success &= deblk_info.alloc(deblk_w, deblk_h, 2); // CTB info if (ctb_info.width_in_units != sps->PicWidthInCtbsY || ctb_info.height_in_units != sps->PicHeightInCtbsY || ctb_info.log2unitSize != sps->Log2CtbSizeY) { delete[] ctb_progress; mem_alloc_success &= ctb_info.alloc(sps->PicWidthInCtbsY, sps->PicHeightInCtbsY, sps->Log2CtbSizeY); ctb_progress = new de265_progress_lock[ ctb_info.data_size ]; } // check for memory shortage if (!mem_alloc_success) { return DE265_ERROR_OUT_OF_MEMORY; } } return DE265_OK; } de265_image::~de265_image() { release(); // free progress locks if (ctb_progress) { delete[] ctb_progress; } } void de265_image::release() { // free image memory if (pixels[0]) { /* if (encoder_image_release_func != nullptr) { encoder_image_release_func(encctx, this, encctx->param_image_allocation_userdata); } else*/ { 
image_allocation_functions.release_buffer(decctx, this, decctx ? decctx->param_image_allocation_userdata : nullptr); } for (int i=0;i<3;i++) { pixels[i] = nullptr; pixels_confwin[i] = nullptr; } } // free slices for (size_t i=0;i= 0); // needed for the shift operation in the check below if (bytes_per_pixel == 1) { if (channel==0) { memset(pixels[channel], value, stride * height); } else { memset(pixels[channel], value, chroma_stride * chroma_height); } } else if ((value >> 8) == (value & 0xFF)) { assert(bytes_per_pixel == 2); // if we fill the same byte value to all bytes, we can still use memset() if (channel==0) { memset(pixels[channel], 0, stride * height * bytes_per_pixel); } else { memset(pixels[channel], 0, chroma_stride * chroma_height * bytes_per_pixel); } } else { assert(bytes_per_pixel == 2); uint16_t v = value; if (channel==0) { // copy value into first row for (int x = 0; x < width; x++) { *reinterpret_cast(&pixels[channel][2 * x]) = v; } // copy first row into remaining rows for (int y = 1; y < height; y++) { memcpy(pixels[channel] + y * stride * 2, pixels[channel], chroma_width * 2); } } else { // copy value into first row for (int x = 0; x < chroma_width; x++) { *reinterpret_cast(&pixels[channel][2 * x]) = v; } // copy first row into remaining rows for (int y = 1; y < chroma_height; y++) { memcpy(pixels[channel] + y * chroma_stride * 2, pixels[channel], chroma_width * 2); } } } } void de265_image::fill_image(int y,int cb,int cr) { if (pixels[0]) { fill_plane(0, y); } if (pixels[1]) { fill_plane(1, cb); } if (pixels[2]) { fill_plane(2, cr); } } de265_error de265_image::copy_image(const de265_image* src) { /* TODO: actually, since we allocate the image only for internal purpose, we do not have to call the external allocation routines for this. However, then we have to track for each image how to release it again. Another option would be to safe the copied data not in an de265_image at all. 
*/ de265_error err = alloc_image(src->width, src->height, src->chroma_format, src->sps, false, src->decctx, /*src->encctx,*/ src->pts, src->user_data, false); if (err != DE265_OK) { return err; } copy_lines_from(src, 0, src->height); return err; } // end = last line + 1 void de265_image::copy_lines_from(const de265_image* src, int first, int end) { if (end > src->height) end=src->height; assert(first % 2 == 0); assert(end % 2 == 0); int luma_bpp = (sps->BitDepth_Y+7)/8; int chroma_bpp = (sps->BitDepth_C+7)/8; if (src->stride == stride) { memcpy(pixels[0] + first*stride * luma_bpp, src->pixels[0] + first*src->stride * luma_bpp, (end-first)*stride * luma_bpp); } else { for (int yp=first;yppixels[0]+yp*src->stride * luma_bpp, src->width * luma_bpp); } } int first_chroma = first / src->SubHeightC; int end_chroma = end / src->SubHeightC; if (src->chroma_format != de265_chroma_mono) { if (src->chroma_stride == chroma_stride) { memcpy(pixels[1] + first_chroma*chroma_stride * chroma_bpp, src->pixels[1] + first_chroma*chroma_stride * chroma_bpp, (end_chroma-first_chroma) * chroma_stride * chroma_bpp); memcpy(pixels[2] + first_chroma*chroma_stride * chroma_bpp, src->pixels[2] + first_chroma*chroma_stride * chroma_bpp, (end_chroma-first_chroma) * chroma_stride * chroma_bpp); } else { for (int y=first_chroma;ypixels[1]+y*src->chroma_stride * chroma_bpp, src->chroma_width * chroma_bpp); memcpy(pixels[2]+y*chroma_stride * chroma_bpp, src->pixels[2]+y*src->chroma_stride * chroma_bpp, src->chroma_width * chroma_bpp); } } } } void de265_image::exchange_pixel_data_with(de265_image& b) { for (int i=0;i<3;i++) { std::swap(pixels[i], b.pixels[i]); std::swap(pixels_confwin[i], b.pixels_confwin[i]); std::swap(plane_user_data[i], b.plane_user_data[i]); } std::swap(stride, b.stride); std::swap(chroma_stride, b.chroma_stride); std::swap(image_allocation_functions, b.image_allocation_functions); } void de265_image::thread_start(int nThreads) { std::unique_lock lock(mutex); //printf("nThreads 
before: %d %d\n",nThreadsQueued, nThreadsTotal); nThreadsQueued += nThreads; nThreadsTotal += nThreads; //printf("nThreads after: %d %d\n",nThreadsQueued, nThreadsTotal); } void de265_image::thread_run(const thread_task* task) { std::unique_lock lock(mutex); //printf("run thread %s\n", task->name().c_str()); nThreadsQueued--; nThreadsRunning++; } void de265_image::thread_blocks() { std::unique_lock lock(mutex); nThreadsRunning--; nThreadsBlocked++; } void de265_image::thread_unblocks() { std::unique_lock lock(mutex); nThreadsBlocked--; nThreadsRunning++; } void de265_image::thread_finishes(const thread_task* task) { //printf("finish thread %s\n", task->name().c_str()); std::unique_lock lock(mutex); nThreadsRunning--; nThreadsFinished++; assert(nThreadsRunning >= 0); if (nThreadsFinished==nThreadsTotal) { finished_cond.notify_all(); } } void de265_image::wait_for_progress(thread_task* task, int ctbx,int ctby, int progress) { const int ctbW = sps->PicWidthInCtbsY; wait_for_progress(task, ctbx + ctbW*ctby, progress); } void de265_image::wait_for_progress(thread_task* task, int ctbAddrRS, int progress) { if (task==nullptr) { return; } de265_progress_lock* progresslock = &ctb_progress[ctbAddrRS]; if (progresslock->get_progress() < progress) { thread_blocks(); assert(task!=nullptr); task->state = thread_task::Blocked; /* TODO: check whether we are the first blocked task in the list. If we are, we have to conceal input errors. Simplest concealment: do not block. 
*/ progresslock->wait_for_progress(progress); task->state = thread_task::Running; thread_unblocks(); } } void de265_image::wait_for_completion() { std::unique_lock lock(mutex); while (nThreadsFinished!=nThreadsTotal) { finished_cond.wait(lock); } } bool de265_image::debug_is_completed() const { return nThreadsFinished==nThreadsTotal; } void de265_image::clear_metadata() { // TODO: maybe we could avoid the memset by ensuring that all data is written to // during decoding (especially log2CbSize), but it is unlikely to be faster than the memset. cb_info.clear(); intraPredMode.clear(); //tu_info.clear(); // done on the fly ctb_info.clear(); deblk_info.clear(); // --- reset CTB progresses --- for (int i=0;i> log2PuSize; int yPu = y >> log2PuSize; int wPu = nPbW >> log2PuSize; int hPu = nPbH >> log2PuSize; int stride = pb_info.width_in_units; for (int pby=0;pby=sps->pic_width_in_luma_samples || yN>=sps->pic_height_in_luma_samples) return false; int minBlockAddrN = pps->MinTbAddrZS[ (xN>>sps->Log2MinTrafoSize) + (yN>>sps->Log2MinTrafoSize) * sps->PicWidthInTbsY ]; int minBlockAddrCurr = pps->MinTbAddrZS[ (xCurr>>sps->Log2MinTrafoSize) + (yCurr>>sps->Log2MinTrafoSize) * sps->PicWidthInTbsY ]; if (minBlockAddrN > minBlockAddrCurr) return false; int xCurrCtb = xCurr >> sps->Log2CtbSizeY; int yCurrCtb = yCurr >> sps->Log2CtbSizeY; int xNCtb = xN >> sps->Log2CtbSizeY; int yNCtb = yN >> sps->Log2CtbSizeY; if (get_SliceAddrRS(xCurrCtb,yCurrCtb) != get_SliceAddrRS(xNCtb, yNCtb)) { return false; } if (pps->TileIdRS[xCurrCtb + yCurrCtb*sps->PicWidthInCtbsY] != pps->TileIdRS[xNCtb + yNCtb *sps->PicWidthInCtbsY]) { return false; } return true; } bool de265_image::available_pred_blk(int xC,int yC, int nCbS, int xP, int yP, int nPbW, int nPbH, int partIdx, int xN,int yN) const { logtrace(LogMotion,"C:%d;%d P:%d;%d N:%d;%d size=%d;%d\n",xC,yC,xP,yP,xN,yN,nPbW,nPbH); int sameCb = (xC <= xN && xN < xC+nCbS && yC <= yN && yN < yC+nCbS); bool availableN; if (!sameCb) { availableN = 
available_zscan(xP,yP,xN,yN); } else { availableN = !(nPbW<<1 == nCbS && nPbH<<1 == nCbS && // NxN partIdx==1 && yN >= yC+nPbH && xN < xC+nPbW); // xN/yN inside partIdx 2 } if (availableN && get_pred_mode(xN,yN) == MODE_INTRA) { availableN = false; } return availableN; } libde265-1.0.18/libde265/image.h000066400000000000000000000605311515675107500157050ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef DE265_IMAGE_H #define DE265_IMAGE_H #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #include #include "libde265/de265.h" #include "libde265/sps.h" #include "libde265/pps.h" #include "libde265/motion.h" #include "libde265/threads.h" #include "libde265/slice.h" #include "libde265/nal.h" struct en265_encoder_context; enum PictureState { UnusedForReference, UsedForShortTermReference, UsedForLongTermReference }; /* TODO: At INTEGRITY_DERIVED_FROM_FAULTY_REFERENCE images, we can check the SEI hash, whether the output image is correct despite the faulty reference, and set the state back to correct. 
*/ constexpr uint8_t INTEGRITY_CORRECT = 0; constexpr uint8_t INTEGRITY_UNAVAILABLE_REFERENCE = 1; constexpr uint8_t INTEGRITY_NOT_DECODED = 2; constexpr uint8_t INTEGRITY_DECODING_ERRORS = 3; constexpr uint8_t INTEGRITY_DERIVED_FROM_FAULTY_REFERENCE = 4; constexpr uint8_t SEI_HASH_UNCHECKED = 0; constexpr uint8_t SEI_HASH_CORRECT = 1; constexpr uint8_t SEI_HASH_INCORRECT = 2; constexpr uint8_t TU_FLAG_NONZERO_COEFF = (1<<7); constexpr uint8_t TU_FLAG_SPLIT_TRANSFORM_MASK = 0x1F; constexpr uint8_t DEBLOCK_FLAG_VERTI = (1<<4); constexpr uint8_t DEBLOCK_FLAG_HORIZ = (1<<5); constexpr uint8_t DEBLOCK_PB_EDGE_VERTI = (1<<6); constexpr uint8_t DEBLOCK_PB_EDGE_HORIZ = (1<<7); constexpr uint8_t DEBLOCK_BS_MASK = 0x03; constexpr int CTB_PROGRESS_NONE = 0; constexpr int CTB_PROGRESS_PREFILTER = 1; constexpr int CTB_PROGRESS_DEBLK_V = 2; constexpr int CTB_PROGRESS_DEBLK_H = 3; constexpr int CTB_PROGRESS_SAO = 4; class decoder_context; template class MetaDataArray { public: MetaDataArray() = default; ~MetaDataArray() { free(data); } LIBDE265_CHECK_RESULT bool alloc(int w,int h, uint8_t _log2unitSize) { int size = w*h; if (size != data_size) { free(data); data = (DataUnit*)calloc(size, sizeof(DataUnit)); if (data == nullptr) { data_size = 0; return false; } data_size = size; } width_in_units = w; height_in_units = h; log2unitSize = _log2unitSize; return data != nullptr; } void clear() { if (data) memset(data, 0, sizeof(DataUnit) * data_size); } const DataUnit& get(int x,int y) const { int unitX = x>>log2unitSize; int unitY = y>>log2unitSize; assert(unitX >= 0 && unitX < width_in_units); assert(unitY >= 0 && unitY < height_in_units); return data[ unitX + unitY*width_in_units ]; } DataUnit& get(int x,int y) { int unitX = x>>log2unitSize; int unitY = y>>log2unitSize; assert(unitX >= 0 && unitX < width_in_units); assert(unitY >= 0 && unitY < height_in_units); return data[ unitX + unitY*width_in_units ]; } void set(int x,int y, const DataUnit& d) { int unitX = x>>log2unitSize; int 
unitY = y>>log2unitSize; assert(unitX >= 0 && unitX < width_in_units); assert(unitY >= 0 && unitY < height_in_units); data[ unitX + unitY*width_in_units ] = d; } DataUnit& operator[](int idx) { return data[idx]; } const DataUnit& operator[](int idx) const { return data[idx]; } int size() const { return data_size; } // private: DataUnit* data = nullptr; int data_size = 0; uint8_t log2unitSize = 0; int width_in_units = 0; int height_in_units = 0; }; struct CTB_info { uint16_t SliceAddrRS; uint16_t SliceHeaderIndex; // index into array to slice header for this CTB sao_info saoInfo; bool deblock; // this CTB has to be deblocked // The following flag helps to quickly check whether we have to // check all conditions in the SAO filter or whether we can skip them. bool has_pcm_or_cu_transquant_bypass; // pcm or transquant_bypass is used in this CTB }; struct CB_ref_info { uint8_t log2CbSize : 3; /* [0;6] (1< sps, bool allocMetadata, decoder_context* dctx, //class encoder_context* ectx, de265_PTS pts, void* user_data, bool useCustomAllocFunctions); //de265_error alloc_encoder_data(const seq_parameter_set* sps); bool is_allocated() const { return pixels[0] != nullptr; } void release(); void set_headers(std::shared_ptr _vps, std::shared_ptr _sps, std::shared_ptr _pps) { vps = _vps; sps = _sps; pps = _pps; } void fill_image(int y,int u,int v); void fill_plane(int channel, int value); de265_error copy_image(const de265_image* src); void copy_lines_from(const de265_image* src, int first, int end); void exchange_pixel_data_with(de265_image&); uint32_t get_ID() const { return ID; } /* */ uint8_t* get_image_plane(int cIdx) { return pixels[cIdx]; } const uint8_t* get_image_plane(int cIdx) const { return pixels[cIdx]; } void set_image_plane(int cIdx, uint8_t* mem, int stride, void *userdata); uint8_t* get_image_plane_at_pos(int cIdx, int xpos,int ypos) { int stride = get_image_stride(cIdx); return pixels[cIdx] + xpos + ypos*stride; } /// xpos;ypos in actual plane resolution template 
pixel_t* get_image_plane_at_pos_NEW(int cIdx, int xpos,int ypos) { int stride = get_image_stride(cIdx); return (pixel_t*)(pixels[cIdx] + (xpos + ypos*stride)*sizeof(pixel_t)); } const uint8_t* get_image_plane_at_pos(int cIdx, int xpos,int ypos) const { int stride = get_image_stride(cIdx); return pixels[cIdx] + xpos + ypos*stride; } void* get_image_plane_at_pos_any_depth(int cIdx, int xpos,int ypos) { int stride = get_image_stride(cIdx); return pixels[cIdx] + ((xpos + ypos*stride) << bpp_shift[cIdx]); } const void* get_image_plane_at_pos_any_depth(int cIdx, int xpos,int ypos) const { int stride = get_image_stride(cIdx); return pixels[cIdx] + ((xpos + ypos*stride) << bpp_shift[cIdx]); } /* Number of pixels in one row (not number of bytes). */ int get_image_stride(int cIdx) const { if (cIdx==0) return stride; else return chroma_stride; } int get_luma_stride() const { return stride; } int get_chroma_stride() const { return chroma_stride; } int get_width (int cIdx=0) const { return cIdx==0 ? width : chroma_width; } int get_height(int cIdx=0) const { return cIdx==0 ? 
height : chroma_height; } de265_chroma get_chroma_format() const { return chroma_format; } int get_bit_depth(int cIdx) const { if (cIdx==0) return sps->BitDepth_Y; else return sps->BitDepth_C; } int get_bytes_per_pixel(int cIdx) const { return (get_bit_depth(cIdx)+7)/8; } bool high_bit_depth(int cIdx) const { return get_bit_depth(cIdx)>8; } bool can_be_released() const { return PicOutputFlag==false && PicState==UnusedForReference; } void add_slice_segment_header(slice_segment_header* shdr) { shdr->slice_index = slices.size(); slices.push_back(shdr); } bool available_zscan(int xCurr,int yCurr, int xN,int yN) const; bool available_pred_blk(int xC,int yC, int nCbS, int xP, int yP, int nPbW, int nPbH, int partIdx, int xN,int yN) const; static de265_image_allocation default_image_allocation; void printBlk(const char* title, int x0,int y0,int blkSize,int cIdx) const { ::printBlk(title, get_image_plane_at_pos(cIdx,x0,y0), blkSize, get_image_stride(cIdx)); } private: uint32_t ID = std::numeric_limits::max(); uint8_t* pixels[3] = { nullptr, nullptr, nullptr }; uint8_t bpp_shift[3] = {}; // 0 for 8 bit, 1 for 16 bit de265_chroma chroma_format = de265_chroma_mono; int width = 0, height = 0; // size in luma pixels int chroma_width = 0, chroma_height = 0; int stride = 0, chroma_stride = 0; public: uint8_t BitDepth_Y = 0, BitDepth_C = 0; uint8_t SubWidthC = 0, SubHeightC = 0; std::vector slices; public: // --- conformance cropping window --- uint8_t* pixels_confwin[3] = { nullptr, nullptr, nullptr }; int width_confwin = 0, height_confwin = 0; int chroma_width_confwin = 0, chroma_height_confwin = 0; // --- decoding info --- // If PicOutputFlag==false && PicState==UnusedForReference, image buffer is free. 
int picture_order_cnt_lsb = -1; // undefined int PicOrderCntVal = -1; // undefined PictureState PicState = UnusedForReference; bool PicOutputFlag = false; uint32_t removed_at_picture_id = 0; // picture not used, so we can assume it has been removed const video_parameter_set& get_vps() const { return *vps; } const seq_parameter_set& get_sps() const { return *sps; } const pic_parameter_set& get_pps() const { return *pps; } bool has_vps() const { return vps != nullptr; } bool has_sps() const { return sps != nullptr; } bool has_pps() const { return pps != nullptr; } std::shared_ptr get_shared_sps() { return sps; } //std::shared_ptr get_shared_sps() const { return sps; } //std::shared_ptr get_shared_pps() const { return pps; } decoder_context* decctx = nullptr; [[nodiscard]] uint32_t number_of_ctbs() const { return static_cast(ctb_info.size()); } private: // The image also keeps a reference to VPS/SPS/PPS, because when decoding is delayed, // the currently active parameter sets in the decctx might already have been replaced // with new parameters. 
std::shared_ptr vps; std::shared_ptr sps; // the SPS used for decoding this image std::shared_ptr pps; // the PPS used for decoding this image MetaDataArray ctb_info; MetaDataArray cb_info; MetaDataArray pb_info; MetaDataArray intraPredMode; MetaDataArray intraPredModeC; MetaDataArray tu_info; MetaDataArray deblk_info; template void set_cb_blk(int x, int y, int log2BlkWidth, Func setter) { int cbX = x >> cb_info.log2unitSize; int cbY = y >> cb_info.log2unitSize; int width = 1 << (log2BlkWidth - cb_info.log2unitSize); for (int cby=cbY;cby> tu_info.log2unitSize; int tuY = y >> tu_info.log2unitSize; int width = 1 << (log2BlkWidth - tu_info.log2unitSize); for (int tuy=tuY;tuy> tu_info.log2unitSize; const int tuY = y >> tu_info.log2unitSize; const int width = 1 << (log2TrafoSize - tu_info.log2unitSize); for (int tuy=tuY;tuy 34) { ipm = 0; } return static_cast(ipm); } IntraPredMode get_IntraPredMode_atIndex(int idx) const { uint8_t ipm = intraPredMode[idx]; if (ipm > 34) { ipm = 0; } return static_cast(ipm); } void set_IntraPredMode(int PUidx,int log2blkSize, IntraPredMode mode) { int pbSize = 1<<(log2blkSize - intraPredMode.log2unitSize); for (int y=0;y>sps->Log2MinPUSize) + (y0>>sps->Log2MinPUSize)*sps->PicWidthInMinPUs; for (int y=0;yPicWidthInMinPUs); assert(y < sps->PicHeightInMinPUs); int idx = PUidx + x + y*intraPredMode.width_in_units; assert(idx>sps->Log2MinPUSize) + (y0>>sps->Log2MinPUSize)*sps->PicWidthInMinPUs; for (int y=0;yPicWidthInMinPUs); assert(yPicHeightInMinPUs); int idx = PUidx + x + y*intraPredModeC.width_in_units; assert(idx= ctb_info.width_in_units || ctbY >= ctb_info.height_in_units) { return; } int idx = ctbX + ctbY*ctb_info.width_in_units; ctb_info[idx].SliceAddrRS = SliceAddrRS; } int get_SliceAddrRS(int ctbX, int ctbY) const { return ctb_info[ctbX + ctbY*ctb_info.width_in_units].SliceAddrRS; } int get_SliceAddrRS_atCtbRS(int ctbRS) const { return ctb_info[ctbRS].SliceAddrRS; } void set_SliceHeaderIndex(int x, int y, int SliceHeaderIndex) { 
ctb_info.get(x,y).SliceHeaderIndex = SliceHeaderIndex; } uint16_t get_SliceHeaderIndex(int x, int y) const { return ctb_info.get(x,y).SliceHeaderIndex; } uint16_t get_SliceHeaderIndexCtb(int ctbX, int ctbY) const { return ctb_info[ctbX + ctbY*ctb_info.width_in_units].SliceHeaderIndex; } uint16_t get_SliceHeaderIndex_atIndex(int ctb) const { return ctb_info[ctb].SliceHeaderIndex; } bool is_SliceHeader_available(int x,int y) const { uint16_t idx = ctb_info.get(x,y).SliceHeaderIndex; return idx < slices.size(); } slice_segment_header* get_SliceHeader(int x, int y) { uint16_t idx = get_SliceHeaderIndex(x,y); if (idx >= slices.size()) { return nullptr; } return slices[idx]; } slice_segment_header* get_SliceHeaderCtb(int ctbX, int ctbY) { uint16_t idx = get_SliceHeaderIndexCtb(ctbX,ctbY); if (idx >= slices.size()) { return nullptr; } return slices[idx]; } const slice_segment_header* get_SliceHeaderCtb(int ctbX, int ctbY) const { uint16_t idx = get_SliceHeaderIndexCtb(ctbX,ctbY); if (idx >= slices.size()) { return nullptr; } return slices[idx]; } void set_sao_info(int ctbX,int ctbY,const sao_info* saoinfo) { sao_info* sao = &ctb_info[ctbX + ctbY*ctb_info.width_in_units].saoInfo; memcpy(sao, saoinfo, sizeof(sao_info)); } const sao_info* get_sao_info(int ctbX,int ctbY) const { return &ctb_info[ctbX + ctbY*ctb_info.width_in_units].saoInfo; } void set_CtbDeblockFlag(int ctbX, int ctbY, bool flag) { int idx = ctbX + ctbY*ctb_info.width_in_units; ctb_info[idx].deblock = flag; } bool get_CtbDeblockFlag(int ctbX, int ctbY) const { return ctb_info[ctbX + ctbY*ctb_info.width_in_units].deblock; } bool get_CTB_has_pcm_or_cu_transquant_bypass(int ctbX,int ctbY) const { int idx = ctbX + ctbY*ctb_info.width_in_units; return ctb_info[idx].has_pcm_or_cu_transquant_bypass; } // --- DEBLK metadata access --- int get_deblk_width() const { return deblk_info.width_in_units; } int get_deblk_height() const { return deblk_info.height_in_units; } void set_deblk_flags(int x0,int y0, uint8_t flags) 
{ const int xd = x0/4; const int yd = y0/4; if (xd * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #include "intrapred.h" #include "transform.h" #include "util.h" #include #include #include void fillIntraPredModeCandidates(enum IntraPredMode candModeList[3], enum IntraPredMode candIntraPredModeA, enum IntraPredMode candIntraPredModeB) { // build candidate list if (candIntraPredModeA == candIntraPredModeB) { if (candIntraPredModeA < 2) { candModeList[0] = INTRA_PLANAR; candModeList[1] = INTRA_DC; candModeList[2] = INTRA_ANGULAR_26; } else { candModeList[0] = candIntraPredModeA; candModeList[1] = (enum IntraPredMode)(2 + ((candIntraPredModeA-2 -1 +32) % 32)); candModeList[2] = (enum IntraPredMode)(2 + ((candIntraPredModeA-2 +1 ) % 32)); } } else { candModeList[0] = candIntraPredModeA; candModeList[1] = candIntraPredModeB; if (candIntraPredModeA != INTRA_PLANAR && candIntraPredModeB != INTRA_PLANAR) { candModeList[2] = INTRA_PLANAR; } else if (candIntraPredModeA != INTRA_DC && candIntraPredModeB != INTRA_DC) { candModeList[2] = INTRA_DC; } else { candModeList[2] = INTRA_ANGULAR_26; } } /* printf("candModeList: %d %d %d\n", candModeList[0], candModeList[1], candModeList[2] ); */ } void fillIntraPredModeCandidates(enum IntraPredMode candModeList[3], int x,int y, int PUidx, bool availableA, // left bool availableB, // top const de265_image* img) { const seq_parameter_set* sps = 
&img->get_sps(); // block on left side enum IntraPredMode candIntraPredModeA, candIntraPredModeB; if (availableA==false) { candIntraPredModeA=INTRA_DC; } else if (img->get_pred_mode(x-1,y) != MODE_INTRA || img->get_pcm_flag (x-1,y)) { candIntraPredModeA=INTRA_DC; } else { candIntraPredModeA = img->get_IntraPredMode_atIndex(PUidx-1); } // block above if (availableB==false) { candIntraPredModeB=INTRA_DC; } else if (img->get_pred_mode(x,y-1) != MODE_INTRA || img->get_pcm_flag (x,y-1)) { candIntraPredModeB=INTRA_DC; } else if (y-1 < ((y >> sps->Log2CtbSizeY) << sps->Log2CtbSizeY)) { candIntraPredModeB=INTRA_DC; } else { candIntraPredModeB = img->get_IntraPredMode_atIndex(PUidx-sps->PicWidthInMinPUs); } logtrace(LogSlice,"%d;%d candA:%d / candB:%d\n", x,y, availableA ? candIntraPredModeA : -999, availableB ? candIntraPredModeB : -999); fillIntraPredModeCandidates(candModeList, candIntraPredModeA, candIntraPredModeB); } int find_intra_pred_mode(enum IntraPredMode mode, enum IntraPredMode candModeList[3]) { // check whether the mode is in the candidate list for (int i=0;i<3;i++) { if (candModeList[i] == mode) { return i; } } // sort candModeList if (candModeList[0] > candModeList[1]) { std::swap(candModeList[0],candModeList[1]); } if (candModeList[0] > candModeList[2]) { std::swap(candModeList[0],candModeList[2]); } if (candModeList[1] > candModeList[2]) { std::swap(candModeList[1],candModeList[2]); } // skip modes already in the candidate list int intraMode = mode; for (int i=2;i>=0;i--) { if (intraMode >= candModeList[i]) { intraMode--; } } return -intraMode-1; } #if 0 void list_chroma_pred_candidates(enum IntraPredMode chroma_mode[5], enum IntraPredMode luma_mode) { enum IntraPredMode chroma_cand[5]; chroma_cand[0] = INTRA_PLANAR; chroma_cand[1] = INTRA_ANGULAR_26; chroma_cand[2] = INTRA_ANGULAR_10; chroma_cand[3] = INTRA_DC; chroma_cand[4] = luma_mode; switch (luma_mode) { case INTRA_PLANAR: chroma_cand[0] = INTRA_ANGULAR_34; break; case INTRA_ANGULAR_26: 
chroma_cand[1] = INTRA_ANGULAR_34; break; case INTRA_ANGULAR_10: chroma_cand[2] = INTRA_ANGULAR_34; break; case INTRA_DC: chroma_cand[3] = INTRA_ANGULAR_34; break; default: // use defaults from above break; } } #endif int get_intra_scan_idx(int log2TrafoSize, enum IntraPredMode intraPredMode, int cIdx, const seq_parameter_set* sps) { if (log2TrafoSize==2 || (log2TrafoSize==3 && (cIdx==0 || sps->ChromaArrayType==CHROMA_444))) { /**/ if (intraPredMode >= 6 && intraPredMode <= 14) return 2; else if (intraPredMode >= 22 && intraPredMode <= 30) return 1; else return 0; } else { return 0; } } int get_intra_scan_idx_luma(int log2TrafoSize, enum IntraPredMode intraPredMode) { if (log2TrafoSize==2 || log2TrafoSize==3) { /**/ if (intraPredMode >= 6 && intraPredMode <= 14) return 2; else if (intraPredMode >= 22 && intraPredMode <= 30) return 1; else return 0; } else { return 0; } } int get_intra_scan_idx_chroma(int log2TrafoSize, enum IntraPredMode intraPredMode) { if (log2TrafoSize==1 || log2TrafoSize==2) { /**/ if (intraPredMode >= 6 && intraPredMode <= 14) return 2; else if (intraPredMode >= 22 && intraPredMode <= 30) return 1; else return 0; } else { return 0; } } enum IntraPredMode lumaPredMode_to_chromaPredMode(enum IntraPredMode luma, enum IntraChromaPredMode chroma) { switch (chroma) { case INTRA_CHROMA_LIKE_LUMA: return luma; case INTRA_CHROMA_PLANAR_OR_34: if (luma==INTRA_PLANAR) return INTRA_ANGULAR_34; else return INTRA_PLANAR; case INTRA_CHROMA_ANGULAR_26_OR_34: if (luma==INTRA_ANGULAR_26) return INTRA_ANGULAR_34; else return INTRA_ANGULAR_26; case INTRA_CHROMA_ANGULAR_10_OR_34: if (luma==INTRA_ANGULAR_10) return INTRA_ANGULAR_34; else return INTRA_ANGULAR_10; case INTRA_CHROMA_DC_OR_34: if (luma==INTRA_DC) return INTRA_ANGULAR_34; else return INTRA_DC; } assert(false); return INTRA_DC; } // (8.4.4.2.2) template void fill_border_samples(de265_image* img, int xB,int yB, // in component specific resolution int nT, int cIdx, pixel_t* out_border) { 
intra_border_computer c; c.init(out_border, img, nT, cIdx, xB, yB); c.preproc(); c.fill_from_image(); c.reference_sample_substitution(); } const int intraPredAngle_table[1+34] = { 0, 0,32,26,21,17,13, 9, 5, 2, 0,-2,-5,-9,-13,-17,-21,-26, -32,-26,-21,-17,-13,-9,-5,-2,0,2,5,9,13,17,21,26,32 }; const int invAngle_table[25-10] = { -4096,-1638,-910,-630,-482,-390,-315,-256, -315,-390,-482,-630,-910,-1638,-4096 }; template void decode_intra_prediction_internal(de265_image* img, int xB0,int yB0, enum IntraPredMode intraPredMode, pixel_t* dst, int dstStride, int nT, int cIdx) { pixel_t border_pixels_mem[4*MAX_INTRA_PRED_BLOCK_SIZE+1]; pixel_t* border_pixels = &border_pixels_mem[2*MAX_INTRA_PRED_BLOCK_SIZE]; fill_border_samples(img, xB0,yB0, nT, cIdx, border_pixels); if (img->get_sps().range_extension.intra_smoothing_disabled_flag == 0 && (cIdx==0 || img->get_sps().ChromaArrayType==CHROMA_444)) { intra_prediction_sample_filtering(img->get_sps(), border_pixels, nT, cIdx, intraPredMode); } switch (intraPredMode) { case INTRA_PLANAR: intra_prediction_planar(dst,dstStride, nT,cIdx, border_pixels); break; case INTRA_DC: intra_prediction_DC(dst,dstStride, nT,cIdx, border_pixels); break; default: { int bit_depth = img->get_bit_depth(cIdx); bool disableIntraBoundaryFilter = (img->get_sps().range_extension.implicit_rdpcm_enabled_flag && img->get_cu_transquant_bypass(xB0,yB0)); intra_prediction_angular(dst,dstStride, bit_depth,disableIntraBoundaryFilter, xB0,yB0,intraPredMode,nT,cIdx, border_pixels); } break; } } // (8.4.4.2.1) void decode_intra_prediction(de265_image* img, int xB0,int yB0, enum IntraPredMode intraPredMode, int nT, int cIdx) { logtrace(LogIntraPred,"decode_intra_prediction xy0:%d/%d mode=%d nT=%d, cIdx=%d\n", xB0,yB0, intraPredMode, nT,cIdx); /* printf("decode_intra_prediction xy0:%d/%d mode=%d nT=%d, cIdx=%d\n", xB0,yB0, intraPredMode, nT,cIdx); */ if (img->high_bit_depth(cIdx)) { decode_intra_prediction_internal(img,xB0,yB0, intraPredMode, 
img->get_image_plane_at_pos_NEW(cIdx,xB0,yB0), img->get_image_stride(cIdx), nT,cIdx); } else { decode_intra_prediction_internal(img,xB0,yB0, intraPredMode, img->get_image_plane_at_pos_NEW(cIdx,xB0,yB0), img->get_image_stride(cIdx), nT,cIdx); } } // TODO: remove this template <> void decode_intra_prediction(de265_image* img, int xB0,int yB0, enum IntraPredMode intraPredMode, uint8_t* dst, int nT, int cIdx) { decode_intra_prediction_internal(img,xB0,yB0, intraPredMode, dst,nT, nT,cIdx); } // TODO: remove this template <> void decode_intra_prediction(de265_image* img, int xB0,int yB0, enum IntraPredMode intraPredMode, uint16_t* dst, int nT, int cIdx) { decode_intra_prediction_internal(img,xB0,yB0, intraPredMode, dst,nT, nT,cIdx); } libde265-1.0.18/libde265/intrapred.h000066400000000000000000000474431515675107500166220ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef DE265_INTRAPRED_H #define DE265_INTRAPRED_H #include "libde265/decctx.h" extern const int intraPredAngle_table[1+34]; /* Fill the three intra-pred-mode candidates into candModeList. Block position is (x,y) and you also have to give the PUidx for this block (which is (x>>Log2MinPUSize) + (y>>Log2MinPUSize)*PicWidthInMinPUs). availableA/B is the output of check_CTB_available(). 
*/ void fillIntraPredModeCandidates(enum IntraPredMode candModeList[3], int x,int y, int PUidx, bool availableA, // left bool availableB, // top const de265_image* img); inline void fillIntraPredModeCandidates(enum IntraPredMode candModeList[3], int x,int y, bool availableA, // left bool availableB, // top const de265_image* img) { int PUidx = img->get_sps().getPUIndexRS(x,y); fillIntraPredModeCandidates(candModeList, x,y, PUidx, availableA,availableB, img); } void fillIntraPredModeCandidates(enum IntraPredMode candModeList[3], enum IntraPredMode candIntraPredModeA, enum IntraPredMode candIntraPredModeB); /* Return value >= 0 -> use mpm_idx(return value) else -> use rem_intra(-return value-1) This function may modify the candModeList ! */ int find_intra_pred_mode(enum IntraPredMode mode, enum IntraPredMode candModeList[3]); void list_chroma_pred_candidates(enum IntraPredMode chroma_mode[5], enum IntraPredMode luma_mode); int get_intra_scan_idx(int log2TrafoSize, enum IntraPredMode intraPredMode, int cIdx, const seq_parameter_set* sps); int get_intra_scan_idx_luma (int log2TrafoSize, enum IntraPredMode intraPredMode); // DEPRECATED int get_intra_scan_idx_chroma(int log2TrafoSize, enum IntraPredMode intraPredMode); // DEPRECATED enum IntraPredMode lumaPredMode_to_chromaPredMode(enum IntraPredMode luma, enum IntraChromaPredMode chroma); /* void decode_intra_block(decoder_context* ctx, thread_context* tctx, int cIdx, int xB0,int yB0, // position of TU in frame (chroma adapted) int x0,int y0, // position of CU in frame (chroma adapted) int log2TrafoSize, int trafoDepth, enum IntraPredMode intraPredMode, bool transform_skip_flag); */ //void fill_border_samples(decoder_context* ctx, int xB,int yB, // int nT, int cIdx, uint8_t* out_border); void decode_intra_prediction(de265_image* img, int xB0,int yB0, enum IntraPredMode intraPredMode, int nT, int cIdx); // TODO: remove this template void decode_intra_prediction(de265_image* img, int xB0,int yB0, enum IntraPredMode 
intraPredMode, pixel_t* dst, int nT, int cIdx); // --- internal use only --- // Actually, the largest TB block can only be 32, but in some intra-pred-mode algorithms // (e.g. min-residual), we may call intra prediction on the maximum CTB size (64). static const int MAX_INTRA_PRED_BLOCK_SIZE = 64; template class intra_border_computer { public: pixel_t* out_border; const de265_image* img; int nT; int cIdx; int xB,yB; const seq_parameter_set* sps; const pic_parameter_set* pps; uint8_t available_data[4*MAX_INTRA_PRED_BLOCK_SIZE + 1]; uint8_t* available; int SubWidth; int SubHeight; bool availableLeft; // is CTB at left side available? bool availableTop; // is CTB at top side available? bool availableTopRight; // is CTB at top-right side available? bool availableTopLeft; // if CTB at top-left pixel available? int nBottom; int nRight; int nAvail; pixel_t firstValue; void init(pixel_t* _out_border, const de265_image* _img, int _nT, int _cIdx, int _xB, int _yB) { img=_img; nT=_nT; cIdx=_cIdx; out_border=_out_border; xB=_xB; yB=_yB; assert(nT <= MAX_INTRA_PRED_BLOCK_SIZE); availableLeft=true; availableTop=true; availableTopRight=true; availableTopLeft=true; } void preproc(); void fill_from_image(); void reference_sample_substitution(); }; #ifdef DE265_LOG_TRACE template void print_border(pixel_t* data, uint8_t* available, int nT) { for (int i=-2*nT ; i<=2*nT ; i++) { if (i==0 || i==1 || i==-nT || i==nT+1) { logtrace(LogIntraPred,"|"); } else { logtrace(LogIntraPred," "); } if (available==nullptr || available[i]) { logtrace(LogIntraPred,"%02x",data[i]); } else { logtrace(LogIntraPred,"--"); } } } #else #define print_border(data, available, nT) #endif // (8.4.4.2.3) template void intra_prediction_sample_filtering(const seq_parameter_set& sps, pixel_t* p, int nT, int cIdx, enum IntraPredMode intraPredMode) { int filterFlag; //printf("filtering, mode: %d\n",intraPredMode); if (intraPredMode==INTRA_DC || nT==4) { filterFlag = 0; } else { // int-cast below prevents a typing 
problem that leads to wrong results when abs_value is a macro int minDistVerHor = libde265_min( abs_value((int)intraPredMode-26), abs_value((int)intraPredMode-10) ); //printf("mindist: %d\n",minDistVerHor); switch (nT) { case 8: filterFlag = (minDistVerHor>7) ? 1 : 0; break; case 16: filterFlag = (minDistVerHor>1) ? 1 : 0; break; case 32: filterFlag = (minDistVerHor>0) ? 1 : 0; break; // there is no official 64x64 TB block, but we call this for some intra-pred mode algorithms // on the whole CB (2Nx2N mode for the whole CTB) case 64: filterFlag = 0; break; default: filterFlag = -1; assert(false); break; // should never happen } } if (filterFlag) { int biIntFlag = (sps.strong_intra_smoothing_enable_flag && cIdx==0 && nT==32 && abs_value(p[0]+p[ 64]-2*p[ 32]) < (1<<(sps.bit_depth_luma-5)) && abs_value(p[0]+p[-64]-2*p[-32]) < (1<<(sps.bit_depth_luma-5))) ? 1 : 0; pixel_t pF_mem[4*32+1]; pixel_t* pF = &pF_mem[2*32]; if (biIntFlag) { pF[-2*nT] = p[-2*nT]; pF[ 2*nT] = p[ 2*nT]; pF[ 0] = p[ 0]; for (int i=1;i<=63;i++) { pF[-i] = p[0] + ((i*(p[-64]-p[0])+32)>>6); pF[ i] = p[0] + ((i*(p[ 64]-p[0])+32)>>6); } } else { pF[-2*nT] = p[-2*nT]; pF[ 2*nT] = p[ 2*nT]; for (int i=-(2*nT-1) ; i<=2*nT-1 ; i++) { pF[i] = (p[i+1] + 2*p[i] + p[i-1] + 2) >> 2; } } // copy back to original array memcpy(p-2*nT, pF-2*nT, (4*nT+1) * sizeof(pixel_t)); } else { // do nothing ? 
} logtrace(LogIntraPred,"post filtering: "); print_border(p,nullptr,nT); logtrace(LogIntraPred,"\n"); } template void intra_prediction_planar(pixel_t* dst, int dstStride, int nT,int cIdx, pixel_t* border) { int Log2_nT = Log2(nT); for (int y=0;y> (Log2_nT+1); } logtrace(LogIntraPred,"result of planar prediction\n"); for (int y=0;y void intra_prediction_DC(pixel_t* dst, int dstStride, int nT,int cIdx, pixel_t* border) { int Log2_nT = Log2(nT); int dcVal = 0; for (int i=0;i>= Log2_nT+1; if (cIdx==0 && nT<32) { dst[0] = (border[-1] + 2*dcVal + border[1] +2) >> 2; for (int x=1;x>2; } for (int y=1;y>2; } for (int y=1;y void intra_prediction_angular(pixel_t* dst, int dstStride, int bit_depth, bool disableIntraBoundaryFilter, int xB0,int yB0, enum IntraPredMode intraPredMode, int nT,int cIdx, pixel_t* border) { pixel_t ref_mem[4*MAX_INTRA_PRED_BLOCK_SIZE+1]; // TODO: what is the required range here ? pixel_t* ref=&ref_mem[2*MAX_INTRA_PRED_BLOCK_SIZE]; assert(intraPredMode<35); assert(intraPredMode>=2); int intraPredAngle = intraPredAngle_table[intraPredMode]; if (intraPredMode >= 18) { for (int x=0;x<=nT;x++) { ref[x] = border[x]; } if (intraPredAngle<0) { int invAngle = invAngle_table[intraPredMode-11]; if ((nT*intraPredAngle)>>5 < -1) { for (int x=(nT*intraPredAngle)>>5; x<=-1; x++) { ref[x] = border[0-((x*invAngle+128)>>8)]; } } } else { for (int x=nT+1; x<=2*nT;x++) { ref[x] = border[x]; } } for (int y=0;y>5; int iFact= ((y+1)*intraPredAngle)&31; if (iFact != 0) { dst[x+y*dstStride] = ((32-iFact)*ref[x+iIdx+1] + iFact*ref[x+iIdx+2] + 16)>>5; } else { dst[x+y*dstStride] = ref[x+iIdx+1]; } } if (intraPredMode==26 && cIdx==0 && nT<32 && !disableIntraBoundaryFilter) { for (int y=0;y>1), bit_depth); } } } else { // intraPredAngle < 18 for (int x=0;x<=nT;x++) { ref[x] = border[-x]; } // DIFF (neg) if (intraPredAngle<0) { int invAngle = invAngle_table[intraPredMode-11]; if ((nT*intraPredAngle)>>5 < -1) { for (int x=(nT*intraPredAngle)>>5; x<=-1; x++) { ref[x] = 
border[((x*invAngle+128)>>8)]; // DIFF (neg) } } } else { for (int x=nT+1; x<=2*nT;x++) { ref[x] = border[-x]; // DIFF (neg) } } for (int y=0;y>5; // DIFF (x<->y) int iFact= ((x+1)*intraPredAngle)&31; // DIFF (x<->y) if (iFact != 0) { dst[x+y*dstStride] = ((32-iFact)*ref[y+iIdx+1] + iFact*ref[y+iIdx+2] + 16)>>5; // DIFF (x<->y) } else { dst[x+y*dstStride] = ref[y+iIdx+1]; // DIFF (x<->y) } } if (intraPredMode==10 && cIdx==0 && nT<32 && !disableIntraBoundaryFilter) { // DIFF 26->10 for (int x=0;xy) dst[x] = Clip_BitDepth(border[-1] + ((border[1+x] - border[0])>>1), bit_depth); // DIFF (x<->y && neg) } } } logtrace(LogIntraPred,"result of angular intra prediction (mode=%d):\n",intraPredMode); for (int y=0;y void intra_border_computer::preproc() { sps = &img->get_sps(); pps = &img->get_pps(); SubWidth = (cIdx==0) ? 1 : sps->SubWidthC; SubHeight = (cIdx==0) ? 1 : sps->SubHeightC; // --- check for CTB boundaries --- int xBLuma = xB * SubWidth; int yBLuma = yB * SubHeight; int log2CtbSize = sps->Log2CtbSizeY; int picWidthInCtbs = sps->PicWidthInCtbsY; //printf("xB/yB: %d %d\n",xB,yB); // are we at left image border if (xBLuma == 0) { availableLeft = false; availableTopLeft = false; xBLuma = 0; // fake value, available flags are already set to false } // are we at top image border if (yBLuma == 0) { availableTop = false; availableTopLeft = false; availableTopRight = false; yBLuma = 0; // fake value, available flags are already set to false } if (xBLuma+nT*SubWidth >= sps->pic_width_in_luma_samples) { availableTopRight=false; } // check for tile and slice boundaries int xCurrCtb = xBLuma >> log2CtbSize; int yCurrCtb = yBLuma >> log2CtbSize; int xLeftCtb = (xBLuma-1) >> log2CtbSize; int xRightCtb = (xBLuma+nT*SubWidth) >> log2CtbSize; int yTopCtb = (yBLuma-1) >> log2CtbSize; int currCTBSlice = img->get_SliceAddrRS(xCurrCtb,yCurrCtb); int leftCTBSlice = availableLeft ? img->get_SliceAddrRS(xLeftCtb, yCurrCtb) : -1; int topCTBSlice = availableTop ? 
img->get_SliceAddrRS(xCurrCtb, yTopCtb) : -1; int toprightCTBSlice = availableTopRight ? img->get_SliceAddrRS(xRightCtb, yTopCtb) : -1; int topleftCTBSlice = availableTopLeft ? img->get_SliceAddrRS(xLeftCtb, yTopCtb) : -1; /* printf("size: %d\n",pps->TileIdRS.size()); printf("curr: %d left: %d top: %d\n", xCurrCtb+yCurrCtb*picWidthInCtbs, availableLeft ? xLeftCtb+yCurrCtb*picWidthInCtbs : 9999, availableTop ? xCurrCtb+yTopCtb*picWidthInCtbs : 9999); */ uint32_t currCTBTileID = pps->TileIdRS[xCurrCtb+yCurrCtb*picWidthInCtbs]; uint32_t leftCTBTileID = availableLeft ? pps->TileIdRS[xLeftCtb+yCurrCtb*picWidthInCtbs] : UINT32_MAX; uint32_t topCTBTileID = availableTop ? pps->TileIdRS[xCurrCtb+yTopCtb*picWidthInCtbs] : UINT32_MAX; uint32_t topleftCTBTileID = availableTopLeft ? pps->TileIdRS[xLeftCtb+yTopCtb*picWidthInCtbs] : UINT32_MAX; uint32_t toprightCTBTileID= availableTopRight? pps->TileIdRS[xRightCtb+yTopCtb*picWidthInCtbs] : UINT32_MAX; if (leftCTBSlice != currCTBSlice || leftCTBTileID != currCTBTileID ) availableLeft = false; if (topCTBSlice != currCTBSlice || topCTBTileID != currCTBTileID ) availableTop = false; if (topleftCTBSlice !=currCTBSlice||topleftCTBTileID!=currCTBTileID ) availableTopLeft = false; if (toprightCTBSlice!=currCTBSlice||toprightCTBTileID!=currCTBTileID) availableTopRight= false; // number of pixels that are in the valid image area to the right and to the bottom nBottom = sps->pic_height_in_luma_samples - yB*SubHeight; nBottom=(nBottom+SubHeight-1)/SubHeight; if (nBottom>2*nT) nBottom=2*nT; nRight = sps->pic_width_in_luma_samples - xB*SubWidth; nRight =(nRight +SubWidth-1)/SubWidth; if (nRight >2*nT) nRight=2*nT; nAvail=0; available = &available_data[2*MAX_INTRA_PRED_BLOCK_SIZE]; memset(available-2*nT, 0, 4*nT+1); } template void intra_border_computer::fill_from_image() { assert(nT<=32); pixel_t* image; int stride; image = (pixel_t*)img->get_image_plane(cIdx); stride = img->get_image_stride(cIdx); int xBLuma = xB * SubWidth; int yBLuma = yB * 
SubHeight; int currBlockAddr = pps->MinTbAddrZS[ (xBLuma>>sps->Log2MinTrafoSize) + (yBLuma>>sps->Log2MinTrafoSize) * sps->PicWidthInTbsY ]; // copy pixels at left column for (int y=nBottom-1 ; y>=0 ; y-=4) if (availableLeft) { int NBlockAddr = pps->MinTbAddrZS[ (((xB-1)*SubWidth )>>sps->Log2MinTrafoSize) + (((yB+y)*SubHeight)>>sps->Log2MinTrafoSize) * sps->PicWidthInTbsY ]; bool availableN = NBlockAddr <= currBlockAddr; if (pps->constrained_intra_pred_flag) { if (img->get_pred_mode((xB-1)*SubWidth,(yB+y)*SubHeight)!=MODE_INTRA) availableN = false; } if (availableN) { if (!nAvail) firstValue = image[xB-1 + (yB+y)*stride]; for (int i=0;i<4;i++) { available[-y+i-1] = availableN; out_border[-y+i-1] = image[xB-1 + (yB+y-i)*stride]; } nAvail+=4; } } // copy pixel at top-left position if (availableTopLeft) { int NBlockAddr = pps->MinTbAddrZS[ (((xB-1)*SubWidth )>>sps->Log2MinTrafoSize) + (((yB-1)*SubHeight)>>sps->Log2MinTrafoSize) * sps->PicWidthInTbsY ]; bool availableN = NBlockAddr <= currBlockAddr; if (pps->constrained_intra_pred_flag) { if (img->get_pred_mode((xB-1)*SubWidth,(yB-1)*SubHeight)!=MODE_INTRA) { availableN = false; } } if (availableN) { if (!nAvail) firstValue = image[xB-1 + (yB-1)*stride]; out_border[0] = image[xB-1 + (yB-1)*stride]; available[0] = availableN; nAvail++; } } // copy pixels at top row for (int x=0 ; xMinTbAddrZS[ (((xB+x)*SubWidth )>>sps->Log2MinTrafoSize) + (((yB-1)*SubHeight)>>sps->Log2MinTrafoSize) * sps->PicWidthInTbsY ]; bool availableN = NBlockAddr <= currBlockAddr; if (pps->constrained_intra_pred_flag) { if (img->get_pred_mode((xB+x)*SubWidth,(yB-1)*SubHeight)!=MODE_INTRA) { availableN = false; } } if (availableN) { if (!nAvail) firstValue = image[xB+x + (yB-1)*stride]; for (int i=0;i<4;i++) { out_border[x+i+1] = image[xB+x+i + (yB-1)*stride]; available[x+i+1] = availableN; } nAvail+=4; } } } } template void intra_border_computer::reference_sample_substitution() { // reference sample substitution const int bit_depth = 
img->get_bit_depth(cIdx); if (nAvail!=4*nT+1) { if (nAvail==0) { if (sizeof(pixel_t)==1) { memset(out_border-2*nT, 1<<(bit_depth-1), 4*nT+1); } else { for (int i = -2*nT; i <= 2*nT ; i++) { out_border[i] = 1<<(bit_depth-1); } } } else { if (!available[-2*nT]) { out_border[-2*nT] = firstValue; } for (int i=-2*nT+1; i<=2*nT; i++) if (!available[i]) { out_border[i]=out_border[i-1]; } } } logtrace(LogIntraPred,"availableN: "); print_border(available,nullptr,nT); logtrace(LogIntraPred,"\n"); logtrace(LogIntraPred,"output: "); print_border(out_border,nullptr,nT); logtrace(LogIntraPred,"\n"); } #endif libde265-1.0.18/libde265/md5.cc000066400000000000000000000203671515675107500154510ustar00rootroot00000000000000/* * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc. * MD5 Message-Digest Algorithm (RFC 1321). * * Homepage: * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5 * * Author: * Alexander Peslyak, better known as Solar Designer * * This software was written by Alexander Peslyak in 2001. No copyright is * claimed, and the software is hereby placed in the public domain. * In case this attempt to disclaim copyright and place the software in the * public domain is deemed null and void, then the software is * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the * general public under the following terms: * * Redistribution and use in source and binary forms, with or without * modification, are permitted. * * There's ABSOLUTELY NO WARRANTY, express or implied. * * (This is a heavily cut-down "BSD license".) * * This differs from Colin Plumb's older public domain implementation in that * no exactly 32-bit integer data type is required (any 32-bit or wider * unsigned integer data type will do), there's no compile-time endianness * configuration, and the function prototypes match OpenSSL's. 
No code from * Colin Plumb's implementation has been reused; this comment merely compares * the properties of the two independent implementations. * * The primary goals of this implementation are portability and ease of use. * It is meant to be fast, but not as fast as possible. Some known * optimizations are not included to reduce source code size and avoid * compile-time configuration. */ #ifndef HAVE_OPENSSL #include #include "md5.h" /* * The basic MD5 functions. * * F and G are optimized compared to their RFC 1321 definitions for * architectures that lack an AND-NOT instruction, just like in Colin Plumb's * implementation. */ #define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) #define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y)))) #define H(x, y, z) ((x) ^ (y) ^ (z)) #define I(x, y, z) ((y) ^ ((x) | ~(z))) /* * The MD5 transformation for all four rounds. */ #define STEP(f, a, b, c, d, x, t, s) \ (a) += f((b), (c), (d)) + (x) + (t); \ (a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \ (a) += (b); /* * SET reads 4 input bytes in little-endian byte order and stores them * in a properly aligned word in host byte order. * * The check for little-endian architectures that tolerate unaligned * memory accesses is just an optimization. Nothing will break if it * doesn't work. */ #if defined(__i386__) || defined(__x86_64__) || defined(__vax__) #define SET(n) \ (*(MD5_u32plus *)&ptr[(n) * 4]) #define GET(n) \ SET(n) #else #define SET(n) \ (ctx->block[(n)] = \ (MD5_u32plus)ptr[(n) * 4] | \ ((MD5_u32plus)ptr[(n) * 4 + 1] << 8) | \ ((MD5_u32plus)ptr[(n) * 4 + 2] << 16) | \ ((MD5_u32plus)ptr[(n) * 4 + 3] << 24)) #define GET(n) \ (ctx->block[(n)]) #endif /* * This processes one or more 64-byte data blocks, but does NOT update * the bit counters. There are no alignment requirements. 
*/
/*
 * ctx:  hashing context; a/b/c/d are read at entry and written back at exit.
 * data: start of the input; read through the SET/GET macros.
 * size: number of bytes to consume.  The loop subtracts 64 until it reaches
 *       zero, so this must be a non-zero multiple of 64 (every caller in
 *       this file passes either 64 or a length rounded down to 64).
 *
 * Returns a pointer to the first byte after the consumed input.
 */
static void *body(MD5_CTX *ctx, void *data, unsigned long size)
{
	unsigned char *ptr;
	MD5_u32plus a, b, c, d;
	MD5_u32plus saved_a, saved_b, saved_c, saved_d;

	ptr = (unsigned char *)data;

	/* Work on local copies of the chaining state. */
	a = ctx->a;
	b = ctx->b;
	c = ctx->c;
	d = ctx->d;

	do {
		/* Remember the state before this block; it is added back below. */
		saved_a = a;
		saved_b = b;
		saved_c = c;
		saved_d = d;

/* Round 1 */
		STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7)
		STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12)
		STEP(F, c, d, a, b, SET(2), 0x242070db, 17)
		STEP(F, b, c, d, a, SET(3), 0xc1bdceee, 22)
		STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7)
		STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12)
		STEP(F, c, d, a, b, SET(6), 0xa8304613, 17)
		STEP(F, b, c, d, a, SET(7), 0xfd469501, 22)
		STEP(F, a, b, c, d, SET(8), 0x698098d8, 7)
		STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12)
		STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17)
		STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22)
		STEP(F, a, b, c, d, SET(12), 0x6b901122, 7)
		STEP(F, d, a, b, c, SET(13), 0xfd987193, 12)
		STEP(F, c, d, a, b, SET(14), 0xa679438e, 17)
		STEP(F, b, c, d, a, SET(15), 0x49b40821, 22)

/* Round 2 */
		STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5)
		STEP(G, d, a, b, c, GET(6), 0xc040b340, 9)
		STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14)
		STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20)
		STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5)
		STEP(G, d, a, b, c, GET(10), 0x02441453, 9)
		STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14)
		STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20)
		STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5)
		STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9)
		STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14)
		STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20)
		STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5)
		STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9)
		STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14)
		STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20)

/* Round 3 */
		STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4)
		STEP(H, d, a, b, c, GET(8), 0x8771f681, 11)
		STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16)
		STEP(H, b, c, d, a, GET(14), 0xfde5380c, 23)
		STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4)
		STEP(H, d, a, b, c, GET(4), 0x4bdecfa9, 11)
		STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16)
		STEP(H, b, c, d, a, GET(10), 0xbebfbc70, 23)
		STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4)
		STEP(H, d, a, b, c, GET(0), 0xeaa127fa, 11)
		STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16)
		STEP(H, b, c, d, a, GET(6), 0x04881d05, 23)
		STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4)
		STEP(H, d, a, b, c, GET(12), 0xe6db99e5, 11)
		STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16)
		STEP(H, b, c, d, a, GET(2), 0xc4ac5665, 23)

/* Round 4 */
		STEP(I, a, b, c, d, GET(0), 0xf4292244, 6)
		STEP(I, d, a, b, c, GET(7), 0x432aff97, 10)
		STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15)
		STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21)
		STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6)
		STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10)
		STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15)
		STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21)
		STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6)
		STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10)
		STEP(I, c, d, a, b, GET(6), 0xa3014314, 15)
		STEP(I, b, c, d, a, GET(13), 0x4e0811a1, 21)
		STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6)
		STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10)
		STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15)
		STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21)

		/* Fold this block's result into the running state. */
		a += saved_a;
		b += saved_b;
		c += saved_c;
		d += saved_d;

		ptr += 64;
	} while (size -= 64);

	ctx->a = a;
	ctx->b = b;
	ctx->c = c;
	ctx->d = d;

	return ptr;
}

/*
 * Prepare ctx for a new message: load the four initial chaining values and
 * clear both halves of the byte counter.
 */
void MD5_Init(MD5_CTX *ctx)
{
	ctx->a = 0x67452301;
	ctx->b = 0xefcdab89;
	ctx->c = 0x98badcfe;
	ctx->d = 0x10325476;

	ctx->lo = 0;
	ctx->hi = 0;
}

/*
 * Feed `size` bytes starting at `data` into the hash.
 *
 * Complete 64-byte blocks are passed to body(); any remainder is kept in
 * ctx->buffer until more data arrives or MD5_Final() is called.
 */
void MD5_Update(MD5_CTX *ctx, void *data, unsigned long size)
{
	MD5_u32plus saved_lo;
	unsigned long used, free;

	/*
	 * ctx->lo keeps the low 29 bits of the byte count; a wrap-around and
	 * the upper bits of `size` are carried into ctx->hi.
	 */
	saved_lo = ctx->lo;
	if ((ctx->lo = (saved_lo + size) & 0x1fffffff) < saved_lo)
		ctx->hi++;
	ctx->hi += size >> 29;

	/* Number of bytes already waiting in ctx->buffer. */
	used = saved_lo & 0x3f;

	if (used) {
		free = 64 - used;

		/* Not enough to complete the buffered block: just append. */
		if (size < free) {
			memcpy(&ctx->buffer[used], data, size);
			return;
		}

		/* Complete the buffered block and process it. */
		memcpy(&ctx->buffer[used], data, free);
		data = (unsigned char *)data + free;
		size -= free;
		body(ctx, ctx->buffer, 64);
	}

	/* Process all remaining whole blocks directly from the input. */
	if (size >= 64) {
		data = body(ctx, data, size & ~(unsigned long)0x3f);
		size &= 0x3f;
	}

	/* Keep the tail (fewer than 64 bytes) for later. */
	memcpy(ctx->buffer, data, size);
}

/*
 * Finish the hash: pad the message, append its length, write the 16-byte
 * digest to `result` and zero the whole context.
 */
void MD5_Final(unsigned char *result, MD5_CTX *ctx)
{
	unsigned long used, free;

	used = ctx->lo & 0x3f;

	/* Padding starts with a single 0x80 byte. */
	ctx->buffer[used++] = 0x80;

	free = 64 - used;

	/* No room left for the 8-byte length: flush a block and start a new one. */
	if (free < 8) {
		memset(&ctx->buffer[used], 0, free);
		body(ctx, ctx->buffer, 64);
		used = 0;
		free = 64;
	}

	memset(&ctx->buffer[used], 0, free - 8);

	/* Convert the byte count to a bit count and store it least-significant byte first. */
	ctx->lo <<= 3;
	ctx->buffer[56] = ctx->lo;
	ctx->buffer[57] = ctx->lo >> 8;
	ctx->buffer[58] = ctx->lo >> 16;
	ctx->buffer[59] = ctx->lo >> 24;
	ctx->buffer[60] = ctx->hi;
	ctx->buffer[61] = ctx->hi >> 8;
	ctx->buffer[62] = ctx->hi >> 16;
	ctx->buffer[63] = ctx->hi >> 24;

	body(ctx, ctx->buffer, 64);

	/* Emit a, b, c, d in that order, each least-significant byte first. */
	result[0] = ctx->a;
	result[1] = ctx->a >> 8;
	result[2] = ctx->a >> 16;
	result[3] = ctx->a >> 24;
	result[4] = ctx->b;
	result[5] = ctx->b >> 8;
	result[6] = ctx->b >> 16;
	result[7] = ctx->b >> 24;
	result[8] = ctx->c;
	result[9] = ctx->c >> 8;
	result[10] = ctx->c >> 16;
	result[11] = ctx->c >> 24;
	result[12] = ctx->d;
	result[13] = ctx->d >> 8;
	result[14] = ctx->d >> 16;
	result[15] = ctx->d >> 24;

	/* Do not leave hashing state behind in the caller's context. */
	memset(ctx, 0, sizeof(*ctx));
}

#endif
libde265-1.0.18/libde265/md5.h000066400000000000000000000025741515675107500153130ustar00rootroot00000000000000
/*
 * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc.
 * MD5 Message-Digest Algorithm (RFC 1321).
 *
 * Homepage:
 * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5
 *
 * Author:
 * Alexander Peslyak, better known as Solar Designer
 *
 * This software was written by Alexander Peslyak in 2001.  No copyright is
 * claimed, and the software is hereby placed in the public domain.
 * In case this attempt to disclaim copyright and place the software in the
 * public domain is deemed null and void, then the software is
 * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the
 * general public under the following terms:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted.
* * There's ABSOLUTELY NO WARRANTY, express or implied. * * See md5.c for more information. */ #ifdef HAVE_OPENSSL #include #elif !defined(_MD5_H) #define _MD5_H /* Any 32-bit or wider unsigned integer data type will do */ typedef unsigned int MD5_u32plus; typedef struct { MD5_u32plus lo, hi; MD5_u32plus a, b, c, d; unsigned char buffer[64]; MD5_u32plus block[16]; } MD5_CTX; extern void MD5_Init(MD5_CTX *ctx); extern void MD5_Update(MD5_CTX *ctx, void *data, unsigned long size); extern void MD5_Final(unsigned char *result, MD5_CTX *ctx); #endif libde265-1.0.18/libde265/motion.cc000066400000000000000000002150101515675107500162600ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #include "motion.h" #include "decctx.h" #include "util.h" #include "dpb.h" #include #include #include #include #if defined(_MSC_VER) || defined(__MINGW32__) # include #elif defined(HAVE_ALLOCA_H) # include #endif #define MAX_CU_SIZE 64 static int extra_before[4] = { 0,3,3,2 }; static int extra_after [4] = { 0,3,4,4 }; template void mc_luma(const base_context* ctx, const seq_parameter_set* sps, int mv_x, int mv_y, int xP,int yP, int16_t* out, int out_stride, const pixel_t* ref, int ref_stride, int nPbW, int nPbH, int bitDepth_L) { int xFracL = mv_x & 3; int yFracL = mv_y & 3; int xIntOffsL = xP + (mv_x>>2); int yIntOffsL = yP + (mv_y>>2); // luma sample interpolation process (8.5.3.2.2.1) //const int shift1 = sps->BitDepth_Y-8; //const int shift2 = 6; const int shift3 = 14 - sps->BitDepth_Y; int w = sps->pic_width_in_luma_samples; int h = sps->pic_height_in_luma_samples; ALIGNED_16(int16_t) mcbuffer[MAX_CU_SIZE * (MAX_CU_SIZE+7)]; if (xFracL==0 && yFracL==0) { if (xIntOffsL >= 0 && yIntOffsL >= 0 && nPbW+xIntOffsL <= w && nPbH+yIntOffsL <= h) { ctx->acceleration.put_hevc_qpel(out, out_stride, &ref[yIntOffsL*ref_stride + xIntOffsL], ref_stride /* sizeof(pixel_t)*/, nPbW,nPbH, mcbuffer, 0,0, bitDepth_L); } else { for (int y=0;y \n"); for (int y=0;y> 6); // 6 will be used when summing predictions } logtrace(LogMotion,"\n"); } #endif } else { int extra_left = extra_before[xFracL]; int extra_right = extra_after [xFracL]; int extra_top = extra_before[yFracL]; int extra_bottom = extra_after [yFracL]; //int nPbW_extra = extra_left + nPbW + extra_right; //int nPbH_extra = extra_top + nPbH + extra_bottom; pixel_t padbuf[(MAX_CU_SIZE+16)*(MAX_CU_SIZE+7)]; const pixel_t* src_ptr; int src_stride; if (-extra_left + xIntOffsL >= 0 && -extra_top + yIntOffsL >= 0 && nPbW+extra_right + xIntOffsL < w && nPbH+extra_bottom + yIntOffsL < h) { src_ptr = &ref[xIntOffsL + yIntOffsL*ref_stride]; src_stride = ref_stride; } else { // Extend fill width to a multiple of 16 so that SIMD 
over-reads // in qpel interpolation hit valid (edge-clamped) data. int fill_width = ((extra_left + nPbW + extra_right + 15) & ~15); if (fill_width > MAX_CU_SIZE+16) fill_width = MAX_CU_SIZE+16; for (int y=-extra_top;yacceleration.put_hevc_qpel(out, out_stride, src_ptr, src_stride /* sizeof(pixel_t) */, nPbW,nPbH, mcbuffer, xFracL,yFracL, bitDepth_L); logtrace(LogMotion,"---V---\n"); for (int y=0;y void mc_chroma(const base_context* ctx, const seq_parameter_set* sps, int mv_x, int mv_y, int xP,int yP, int16_t* out, int out_stride, const pixel_t* ref, int ref_stride, int nPbWC, int nPbHC, int bit_depth_C) { // chroma sample interpolation process (8.5.3.2.2.2) //const int shift1 = sps->BitDepth_C-8; //const int shift2 = 6; const int shift3 = 14 - sps->BitDepth_C; int wC = sps->pic_width_in_luma_samples /sps->SubWidthC; int hC = sps->pic_height_in_luma_samples/sps->SubHeightC; mv_x *= 2 / sps->SubWidthC; mv_y *= 2 / sps->SubHeightC; int xFracC = mv_x & 7; int yFracC = mv_y & 7; int xIntOffsC = xP/sps->SubWidthC + (mv_x>>3); int yIntOffsC = yP/sps->SubHeightC + (mv_y>>3); ALIGNED_32(int16_t mcbuffer[MAX_CU_SIZE*(MAX_CU_SIZE+7)]); if (xFracC == 0 && yFracC == 0) { if (xIntOffsC>=0 && nPbWC+xIntOffsC<=wC && yIntOffsC>=0 && nPbHC+yIntOffsC<=hC) { ctx->acceleration.put_hevc_epel(out, out_stride, &ref[xIntOffsC + yIntOffsC*ref_stride], ref_stride, nPbWC,nPbHC, 0,0, nullptr, bit_depth_C); } else { for (int y=0;y=1 && nPbWC+xIntOffsC<=wC-2 && yIntOffsC>=1 && nPbHC+yIntOffsC<=hC-2) { src_ptr = &ref[xIntOffsC + yIntOffsC*ref_stride]; src_stride = ref_stride; } else { // Extend fill width to a multiple of 16 so that SIMD over-reads // in epel interpolation hit valid (edge-clamped) data. 
int fill_width = ((extra_left + nPbWC + extra_right + 15) & ~15); if (fill_width > MAX_CU_SIZE+16) fill_width = MAX_CU_SIZE+16; for (int y=-extra_top;yacceleration.put_hevc_epel_hv(out, out_stride, src_ptr, src_stride, nPbWC,nPbHC, xFracC,yFracC, mcbuffer, bit_depth_C); } else if (xFracC) { ctx->acceleration.put_hevc_epel_h(out, out_stride, src_ptr, src_stride, nPbWC,nPbHC, xFracC,yFracC, mcbuffer, bit_depth_C); } else if (yFracC) { ctx->acceleration.put_hevc_epel_v(out, out_stride, src_ptr, src_stride, nPbWC,nPbHC, xFracC,yFracC, mcbuffer, bit_depth_C); } else { assert(false); // full-pel shifts are handled above } } } // 8.5.3.2 // NOTE: for full-pel shifts, we can introduce a fast path, simply copying without shifts void generate_inter_prediction_samples(base_context* ctx, const slice_segment_header* shdr, de265_image* img, int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, const PBMotion* vi) { int xP = xC+xB; int yP = yC+yB; void* pixels[3]; int stride[3]; const pic_parameter_set* pps = shdr->pps.get(); const seq_parameter_set* sps = pps->sps.get(); if (sps->BitDepth_Y != img->get_bit_depth(0) || sps->BitDepth_C != img->get_bit_depth(1)) { img->integrity = INTEGRITY_DECODING_ERRORS; ctx->add_warning(DE265_WARNING_BIT_DEPTH_OF_CURRENT_IMAGE_DOES_NOT_MATCH_SPS, false); return; } if (sps->chroma_format_idc != img->get_chroma_format()) { img->integrity = INTEGRITY_DECODING_ERRORS; ctx->add_warning(DE265_WARNING_CHROMA_OF_CURRENT_IMAGE_DOES_NOT_MATCH_SPS, false); return; } const int SubWidthC = sps->SubWidthC; const int SubHeightC = sps->SubHeightC; pixels[0] = img->get_image_plane_at_pos_any_depth(0,xP,yP); stride[0] = img->get_image_stride(0); pixels[1] = img->get_image_plane_at_pos_any_depth(1,xP/SubWidthC,yP/SubHeightC); stride[1] = img->get_image_stride(1); pixels[2] = img->get_image_plane_at_pos_any_depth(2,xP/SubWidthC,yP/SubHeightC); stride[2] = img->get_image_stride(2); ALIGNED_16(int16_t) predSamplesL [2 /* LX */][MAX_CU_SIZE* MAX_CU_SIZE]; 
ALIGNED_16(int16_t) predSamplesC[2 /* chroma */ ][2 /* LX */][MAX_CU_SIZE* MAX_CU_SIZE]; //int xP = xC+xB; //int yP = yC+yB; int predFlag[2]; predFlag[0] = vi->predFlag[0]; predFlag[1] = vi->predFlag[1]; const int bit_depth_L = sps->BitDepth_Y; const int bit_depth_C = sps->BitDepth_C; // Some encoders use bi-prediction with two similar MVs. // Identify this case and use only one MV. // do this only without weighted prediction, because the weights/offsets may be different if (pps->weighted_pred_flag==0) { if (predFlag[0] && predFlag[1]) { if (vi->mv[0].x == vi->mv[1].x && vi->mv[0].y == vi->mv[1].y && shdr->RefPicList[0][vi->refIdx[0]] == shdr->RefPicList[1][vi->refIdx[1]]) { predFlag[1] = 0; } } } // Fill prediction samples with mid-grey in intermediate precision. // Used on error paths where the reference picture is unavailable or mismatched. auto fill_pred_samples = [&](int l) { const int16_t fill = 1 << 13; // mid-grey: (1 << (bd-1)) << (14-bd) for any bd for (int y = 0; y < nPbH; y++) for (int x = 0; x < nPbW; x++) predSamplesL[l][y * nCS + x] = fill; if (img->get_chroma_format() != de265_chroma_mono) { int cW = nPbW / SubWidthC; int cH = nPbH / SubHeightC; for (int y = 0; y < cH; y++) for (int x = 0; x < cW; x++) { predSamplesC[0][l][y * nCS + x] = fill; predSamplesC[1][l][y * nCS + x] = fill; } } }; for (int l=0;l<2;l++) { if (predFlag[l]) { // 8.5.3.2.1 const de265_image* refPic = ctx->get_image(shdr->RefPicList[l][vi->refIdx[l]]); logtrace(LogMotion, "refIdx: %d -> dpb[%d]\n", vi->refIdx[l], shdr->RefPicList[l][vi->refIdx[l]]); if (!refPic || refPic->PicState == UnusedForReference) { img->integrity = INTEGRITY_DECODING_ERRORS; ctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false); fill_pred_samples(l); } else if (refPic->get_width(0) != sps->pic_width_in_luma_samples || refPic->get_height(0) != sps->pic_height_in_luma_samples || img->get_chroma_format() != refPic->get_chroma_format()) { img->integrity = INTEGRITY_DECODING_ERRORS; 
ctx->add_warning(DE265_WARNING_REFERENCE_IMAGE_SIZE_DOES_NOT_MATCH_SPS, false); fill_pred_samples(l); } else if (img->get_bit_depth(0) != refPic->get_bit_depth(0) || img->get_bit_depth(1) != refPic->get_bit_depth(1)) { img->integrity = INTEGRITY_DECODING_ERRORS; ctx->add_warning(DE265_WARNING_REFERENCE_IMAGE_BIT_DEPTH_DOES_NOT_MATCH, false); fill_pred_samples(l); } else if (img->get_chroma_format() != refPic->get_chroma_format()) { img->integrity = INTEGRITY_DECODING_ERRORS; ctx->add_warning(DE265_WARNING_REFERENCE_IMAGE_CHROMA_FORMAT_DOES_NOT_MATCH, false); fill_pred_samples(l); } else { // 8.5.3.2.2 logtrace(LogMotion,"do MC: L%d,MV=%d;%d RefPOC=%d\n", l,vi->mv[l].x,vi->mv[l].y,refPic->PicOrderCntVal); // TODO: must predSamples stride really be nCS or can it be something smaller like nPbW? if (img->high_bit_depth(0)) { mc_luma(ctx, sps, vi->mv[l].x, vi->mv[l].y, xP,yP, predSamplesL[l],nCS, (const uint16_t*)refPic->get_image_plane(0), refPic->get_luma_stride(), nPbW,nPbH, bit_depth_L); } else { mc_luma(ctx, sps, vi->mv[l].x, vi->mv[l].y, xP,yP, predSamplesL[l],nCS, (const uint8_t*)refPic->get_image_plane(0), refPic->get_luma_stride(), nPbW,nPbH, bit_depth_L); } if (img->get_chroma_format() != de265_chroma_mono) { if (img->high_bit_depth(1)) { mc_chroma(ctx, sps, vi->mv[l].x, vi->mv[l].y, xP, yP, predSamplesC[0][l], nCS, (const uint16_t*) refPic->get_image_plane(1), refPic->get_chroma_stride(), nPbW / SubWidthC, nPbH / SubHeightC, bit_depth_C); mc_chroma(ctx, sps, vi->mv[l].x, vi->mv[l].y, xP, yP, predSamplesC[1][l], nCS, (const uint16_t*) refPic->get_image_plane(2), refPic->get_chroma_stride(), nPbW / SubWidthC, nPbH / SubHeightC, bit_depth_C); } else { mc_chroma(ctx, sps, vi->mv[l].x, vi->mv[l].y, xP, yP, predSamplesC[0][l], nCS, (const uint8_t*) refPic->get_image_plane(1), refPic->get_chroma_stride(), nPbW / SubWidthC, nPbH / SubHeightC, bit_depth_C); mc_chroma(ctx, sps, vi->mv[l].x, vi->mv[l].y, xP, yP, predSamplesC[1][l], nCS, (const uint8_t*) 
refPic->get_image_plane(2), refPic->get_chroma_stride(), nPbW / SubWidthC, nPbH / SubHeightC, bit_depth_C); } } } } } // weighted sample prediction (8.5.3.2.3) const int shift1_L = libde265_max(2,14-sps->BitDepth_Y); const int offset_shift1_L = img->get_sps().WpOffsetBdShiftY; const int shift1_C = libde265_max(2,14-sps->BitDepth_C); const int offset_shift1_C = img->get_sps().WpOffsetBdShiftC; /* const int shift1_L = 14-img->sps.BitDepth_Y; const int offset_shift1_L = img->sps.BitDepth_Y-8; const int shift1_C = 14-img->sps.BitDepth_C; const int offset_shift1_C = img->sps.BitDepth_C-8; */ /* if (0) printf("%d/%d %d/%d %d/%d %d/%d\n", shift1_L, Nshift1_L, offset_shift1_L, Noffset_shift1_L, shift1_C, Nshift1_C, offset_shift1_C, Noffset_shift1_C); assert(shift1_L== Nshift1_L); assert(offset_shift1_L== Noffset_shift1_L); assert(shift1_C== Nshift1_C); assert(offset_shift1_C== Noffset_shift1_C); */ logtrace(LogMotion,"predFlags (modified): %d %d\n", predFlag[0], predFlag[1]); if (shdr->slice_type == SLICE_TYPE_P) { if (pps->weighted_pred_flag==0) { if (predFlag[0]==1 && predFlag[1]==0) { ctx->acceleration.put_unweighted_pred(pixels[0], stride[0], predSamplesL[0],nCS, nPbW,nPbH, bit_depth_L); if (img->get_chroma_format() != de265_chroma_mono) { ctx->acceleration.put_unweighted_pred(pixels[1], stride[1], predSamplesC[0][0], nCS, nPbW / SubWidthC, nPbH / SubHeightC, bit_depth_C); ctx->acceleration.put_unweighted_pred(pixels[2], stride[2], predSamplesC[1][0], nCS, nPbW / SubWidthC, nPbH / SubHeightC, bit_depth_C); } } else { ctx->add_warning(DE265_WARNING_BOTH_PREDFLAGS_ZERO, false); img->integrity = INTEGRITY_DECODING_ERRORS; } } else { // weighted prediction if (predFlag[0]==1 && predFlag[1]==0) { int refIdx0 = vi->refIdx[0]; int luma_log2WD = shdr->luma_log2_weight_denom + shift1_L; int chroma_log2WD = shdr->ChromaLog2WeightDenom + shift1_C; int luma_w0 = shdr->LumaWeight[0][refIdx0]; int luma_o0 = shdr->luma_offset[0][refIdx0] * (1<<(offset_shift1_L)); int chroma0_w0 = 
shdr->ChromaWeight[0][refIdx0][0]; int chroma0_o0 = shdr->ChromaOffset[0][refIdx0][0] * (1<<(offset_shift1_C)); int chroma1_w0 = shdr->ChromaWeight[0][refIdx0][1]; int chroma1_o0 = shdr->ChromaOffset[0][refIdx0][1] * (1<<(offset_shift1_C)); logtrace(LogMotion,"weighted-0 [%d] %d %d %d %dx%d\n", refIdx0, luma_log2WD-6,luma_w0,luma_o0,nPbW,nPbH); ctx->acceleration.put_weighted_pred(pixels[0], stride[0], predSamplesL[0],nCS, nPbW,nPbH, luma_w0, luma_o0, luma_log2WD, bit_depth_L); if (img->get_chroma_format() != de265_chroma_mono) { ctx->acceleration.put_weighted_pred(pixels[1], stride[1], predSamplesC[0][0], nCS, nPbW / SubWidthC, nPbH / SubHeightC, chroma0_w0, chroma0_o0, chroma_log2WD, bit_depth_C); ctx->acceleration.put_weighted_pred(pixels[2], stride[2], predSamplesC[1][0], nCS, nPbW / SubWidthC, nPbH / SubHeightC, chroma1_w0, chroma1_o0, chroma_log2WD, bit_depth_C); } } else { ctx->add_warning(DE265_WARNING_BOTH_PREDFLAGS_ZERO, false); img->integrity = INTEGRITY_DECODING_ERRORS; } } } else { assert(shdr->slice_type == SLICE_TYPE_B); if (predFlag[0]==1 && predFlag[1]==1) { if (pps->weighted_bipred_flag==0) { //const int shift2 = 15-8; // TODO: real bit depth //const int offset2 = 1<<(shift2-1); int16_t* in0 = predSamplesL[0]; int16_t* in1 = predSamplesL[1]; ctx->acceleration.put_weighted_pred_avg(pixels[0], stride[0], in0,in1, nCS, nPbW, nPbH, bit_depth_L); int16_t* in00 = predSamplesC[0][0]; int16_t* in01 = predSamplesC[0][1]; int16_t* in10 = predSamplesC[1][0]; int16_t* in11 = predSamplesC[1][1]; if (img->get_chroma_format() != de265_chroma_mono) { ctx->acceleration.put_weighted_pred_avg(pixels[1], stride[1], in00, in01, nCS, nPbW / SubWidthC, nPbH / SubHeightC, bit_depth_C); ctx->acceleration.put_weighted_pred_avg(pixels[2], stride[2], in10, in11, nCS, nPbW / SubWidthC, nPbH / SubHeightC, bit_depth_C); } } else { // weighted prediction int refIdx0 = vi->refIdx[0]; int refIdx1 = vi->refIdx[1]; int luma_log2WD = shdr->luma_log2_weight_denom + shift1_L; int 
chroma_log2WD = shdr->ChromaLog2WeightDenom + shift1_C; int luma_w0 = shdr->LumaWeight[0][refIdx0]; int luma_o0 = shdr->luma_offset[0][refIdx0] * (1<<(offset_shift1_L)); int luma_w1 = shdr->LumaWeight[1][refIdx1]; int luma_o1 = shdr->luma_offset[1][refIdx1] * (1<<(offset_shift1_L)); int chroma0_w0 = shdr->ChromaWeight[0][refIdx0][0]; int chroma0_o0 = shdr->ChromaOffset[0][refIdx0][0] * (1<<(offset_shift1_C)); int chroma1_w0 = shdr->ChromaWeight[0][refIdx0][1]; int chroma1_o0 = shdr->ChromaOffset[0][refIdx0][1] * (1<<(offset_shift1_C)); int chroma0_w1 = shdr->ChromaWeight[1][refIdx1][0]; int chroma0_o1 = shdr->ChromaOffset[1][refIdx1][0] * (1<<(offset_shift1_C)); int chroma1_w1 = shdr->ChromaWeight[1][refIdx1][1]; int chroma1_o1 = shdr->ChromaOffset[1][refIdx1][1] * (1<<(offset_shift1_C)); logtrace(LogMotion,"weighted-BI-0 [%d] %d %d %d %dx%d\n", refIdx0, luma_log2WD-6,luma_w0,luma_o0,nPbW,nPbH); logtrace(LogMotion,"weighted-BI-1 [%d] %d %d %d %dx%d\n", refIdx1, luma_log2WD-6,luma_w1,luma_o1,nPbW,nPbH); int16_t* in0 = predSamplesL[0]; int16_t* in1 = predSamplesL[1]; ctx->acceleration.put_weighted_bipred(pixels[0], stride[0], in0,in1, nCS, nPbW, nPbH, luma_w0,luma_o0, luma_w1,luma_o1, luma_log2WD, bit_depth_L); int16_t* in00 = predSamplesC[0][0]; int16_t* in01 = predSamplesC[0][1]; int16_t* in10 = predSamplesC[1][0]; int16_t* in11 = predSamplesC[1][1]; if (img->get_chroma_format() != de265_chroma_mono) { ctx->acceleration.put_weighted_bipred(pixels[1], stride[1], in00, in01, nCS, nPbW / SubWidthC, nPbH / SubHeightC, chroma0_w0, chroma0_o0, chroma0_w1, chroma0_o1, chroma_log2WD, bit_depth_C); ctx->acceleration.put_weighted_bipred(pixels[2], stride[2], in10, in11, nCS, nPbW / SubWidthC, nPbH / SubHeightC, chroma1_w0, chroma1_o0, chroma1_w1, chroma1_o1, chroma_log2WD, bit_depth_C); } } } else if (predFlag[0]==1 || predFlag[1]==1) { int l = predFlag[0] ? 
0 : 1; if (pps->weighted_bipred_flag==0) { ctx->acceleration.put_unweighted_pred(pixels[0], stride[0], predSamplesL[l],nCS, nPbW,nPbH, bit_depth_L); if (img->get_chroma_format() != de265_chroma_mono) { ctx->acceleration.put_unweighted_pred(pixels[1], stride[1], predSamplesC[0][l], nCS, nPbW / SubWidthC, nPbH / SubHeightC, bit_depth_C); ctx->acceleration.put_unweighted_pred(pixels[2], stride[2], predSamplesC[1][l], nCS, nPbW / SubWidthC, nPbH / SubHeightC, bit_depth_C); } } else { int refIdx = vi->refIdx[l]; int luma_log2WD = shdr->luma_log2_weight_denom + shift1_L; int chroma_log2WD = shdr->ChromaLog2WeightDenom + shift1_C; int luma_w = shdr->LumaWeight[l][refIdx]; int luma_o = shdr->luma_offset[l][refIdx] * (1<<(offset_shift1_L)); int chroma0_w = shdr->ChromaWeight[l][refIdx][0]; int chroma0_o = shdr->ChromaOffset[l][refIdx][0] * (1<<(offset_shift1_C)); int chroma1_w = shdr->ChromaWeight[l][refIdx][1]; int chroma1_o = shdr->ChromaOffset[l][refIdx][1] * (1<<(offset_shift1_C)); logtrace(LogMotion,"weighted-B-L%d [%d] %d %d %d %dx%d\n", l, refIdx, luma_log2WD-6,luma_w,luma_o,nPbW,nPbH); ctx->acceleration.put_weighted_pred(pixels[0], stride[0], predSamplesL[l],nCS, nPbW,nPbH, luma_w, luma_o, luma_log2WD, bit_depth_L); if (img->get_chroma_format() != de265_chroma_mono) { ctx->acceleration.put_weighted_pred(pixels[1], stride[1], predSamplesC[0][l], nCS, nPbW / SubWidthC, nPbH / SubHeightC, chroma0_w, chroma0_o, chroma_log2WD, bit_depth_C); ctx->acceleration.put_weighted_pred(pixels[2], stride[2], predSamplesC[1][l], nCS, nPbW / SubWidthC, nPbH / SubHeightC, chroma1_w, chroma1_o, chroma_log2WD, bit_depth_C); } } } else { // TODO: check why it can actually happen that both predFlags[] are false. // For now, we ignore this and continue decoding. 
ctx->add_warning(DE265_WARNING_BOTH_PREDFLAGS_ZERO, false); img->integrity = INTEGRITY_DECODING_ERRORS; } } #if defined(DE265_LOG_TRACE) && 0 logtrace(LogTransform,"MC pixels (luma), position %d %d:\n", xP,yP); for (int y=0;yget_PartMode(x,y); } const PBMotion& get_mv_info(int x,int y) const override { return img->get_mv_info(x,y); } private: const de265_image* img; }; /* +--+ +--+--+ |B2| |B1|B0| +--+----------------+--+--+ | | | | | | | | | PB | | | | | +--+ | |A1| | +--+-------------------+ |A0| +--+ */ // 8.5.3.1.2 // TODO: check: can we fill the candidate list directly in this function and omit to copy later /* xC/yC: CB position nCS: CB size (probably modified because of singleMCLFlag) xP/yP: PB position (absolute) (probably modified because of singleMCLFlag) singleMCLFlag nPbW/nPbH: PB size partIdx out_cand: merging candidate vectors Add these candidates: - A1 - B1 (if != A1) - B0 (if != B1) - A0 (if != A1) - B2 (if != A1 and != B1) A maximum of 4 candidates are generated. Note 1: For a CB split into two PBs, it does not make sense to merge the second part to the parameters of the first part, since then, we could use 2Nx2N right away. -> Exclude this candidate. 
*/ int derive_spatial_merging_candidates(//const de265_image* img, const MotionVectorAccess& mvaccess, const de265_image* img, int xC, int yC, int nCS, int xP, int yP, uint8_t singleMCLFlag, int nPbW, int nPbH, int partIdx, PBMotion* out_cand, int maxCandidates) { const pic_parameter_set* pps = &img->get_pps(); const int log2_parallel_merge_level = pps->log2_parallel_merge_level; enum PartMode PartMode = mvaccess.get_PartMode(xC,yC); /* const int A0 = SpatialMergingCandidates::PRED_A0; const int A1 = SpatialMergingCandidates::PRED_A1; const int B0 = SpatialMergingCandidates::PRED_B0; const int B1 = SpatialMergingCandidates::PRED_B1; const int B2 = SpatialMergingCandidates::PRED_B2; */ // --- A1 --- // a pixel within A1 (bottom right of A1) int xA1 = xP-1; int yA1 = yP+nPbH-1; bool availableA1; int idxA1; int computed_candidates = 0; // check if candidate is in same motion-estimation region (MER) -> discard if ((xP>>log2_parallel_merge_level) == (xA1>>log2_parallel_merge_level) && (yP>>log2_parallel_merge_level) == (yA1>>log2_parallel_merge_level)) { availableA1 = false; logtrace(LogMotion,"spatial merging candidate A1: below parallel merge level\n"); } // redundant candidate? 
(Note 1) -> discard else if (// !singleMCLFlag && automatically true when partIdx==1 partIdx==1 && (PartMode==PART_Nx2N || PartMode==PART_nLx2N || PartMode==PART_nRx2N)) { availableA1 = false; logtrace(LogMotion,"spatial merging candidate A1: second part ignore\n"); } // MV available in A1 else { availableA1 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA1,yA1); if (!availableA1) logtrace(LogMotion,"spatial merging candidate A1: unavailable\n"); } if (availableA1) { idxA1 = computed_candidates++; out_cand[idxA1] = mvaccess.get_mv_info(xA1,yA1); logtrace(LogMotion,"spatial merging candidate A1:\n"); logmvcand(out_cand[idxA1]); } if (computed_candidates>=maxCandidates) return computed_candidates; // --- B1 --- int xB1 = xP+nPbW-1; int yB1 = yP-1; bool availableB1; int idxB1; // same MER -> discard if ((xP>>log2_parallel_merge_level) == (xB1>>log2_parallel_merge_level) && (yP>>log2_parallel_merge_level) == (yB1>>log2_parallel_merge_level)) { availableB1 = false; logtrace(LogMotion,"spatial merging candidate B1: below parallel merge level\n"); } // redundant candidate (Note 1) -> discard else if (// !singleMCLFlag && automatically true when partIdx==1 partIdx==1 && (PartMode==PART_2NxN || PartMode==PART_2NxnU || PartMode==PART_2NxnD)) { availableB1 = false; logtrace(LogMotion,"spatial merging candidate B1: second part ignore\n"); } // MV available in B1 else { availableB1 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB1,yB1); if (!availableB1) logtrace(LogMotion,"spatial merging candidate B1: unavailable\n"); } if (availableB1) { const PBMotion& b1 = img->get_mv_info(xB1,yB1); // B1 == A1 -> discard B1 if (availableA1 && out_cand[idxA1] == b1) { idxB1 = idxA1; logtrace(LogMotion,"spatial merging candidate B1: redundant to A1\n"); } else { idxB1 = computed_candidates++; out_cand[idxB1] = b1; logtrace(LogMotion,"spatial merging candidate B1:\n"); logmvcand(out_cand[idxB1]); } } if (computed_candidates>=maxCandidates) return 
computed_candidates; // --- B0 --- int xB0 = xP+nPbW; int yB0 = yP-1; bool availableB0; int idxB0; if ((xP>>log2_parallel_merge_level) == (xB0>>log2_parallel_merge_level) && (yP>>log2_parallel_merge_level) == (yB0>>log2_parallel_merge_level)) { availableB0 = false; logtrace(LogMotion,"spatial merging candidate B0: below parallel merge level\n"); } else { availableB0 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB0,yB0); if (!availableB0) logtrace(LogMotion,"spatial merging candidate B0: unavailable\n"); } if (availableB0) { const PBMotion& b0 = img->get_mv_info(xB0,yB0); // B0 == B1 -> discard B0 if (availableB1 && out_cand[idxB1]==b0) { idxB0 = idxB1; logtrace(LogMotion,"spatial merging candidate B0: redundant to B1\n"); } else { idxB0 = computed_candidates++; out_cand[idxB0] = b0; logtrace(LogMotion,"spatial merging candidate B0:\n"); logmvcand(out_cand[idxB0]); } } if (computed_candidates>=maxCandidates) return computed_candidates; // --- A0 --- int xA0 = xP-1; int yA0 = yP+nPbH; bool availableA0; int idxA0; if ((xP>>log2_parallel_merge_level) == (xA0>>log2_parallel_merge_level) && (yP>>log2_parallel_merge_level) == (yA0>>log2_parallel_merge_level)) { availableA0 = false; logtrace(LogMotion,"spatial merging candidate A0: below parallel merge level\n"); } else { availableA0 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA0,yA0); if (!availableA0) logtrace(LogMotion,"spatial merging candidate A0: unavailable\n"); } if (availableA0) { const PBMotion& a0 = img->get_mv_info(xA0,yA0); // A0 == A1 -> discard A0 if (availableA1 && out_cand[idxA1]==a0) { idxA0 = idxA1; logtrace(LogMotion,"spatial merging candidate A0: redundant to A1\n"); } else { idxA0 = computed_candidates++; out_cand[idxA0] = a0; logtrace(LogMotion,"spatial merging candidate A0:\n"); logmvcand(out_cand[idxA0]); } } if (computed_candidates>=maxCandidates) return computed_candidates; // --- B2 --- int xB2 = xP-1; int yB2 = yP-1; bool availableB2; int idxB2; // if we 
already have four candidates, do not consider B2 anymore if (computed_candidates==4) { availableB2 = false; logtrace(LogMotion,"spatial merging candidate B2: ignore\n"); } else if ((xP>>log2_parallel_merge_level) == (xB2>>log2_parallel_merge_level) && (yP>>log2_parallel_merge_level) == (yB2>>log2_parallel_merge_level)) { availableB2 = false; logtrace(LogMotion,"spatial merging candidate B2: below parallel merge level\n"); } else { availableB2 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB2,yB2); if (!availableB2) logtrace(LogMotion,"spatial merging candidate B2: unavailable\n"); } if (availableB2) { const PBMotion& b2 = img->get_mv_info(xB2,yB2); // B2 == B1 -> discard B2 if (availableB1 && out_cand[idxB1]==b2) { idxB2 = idxB1; logtrace(LogMotion,"spatial merging candidate B2: redundant to B1\n"); } // B2 == A1 -> discard B2 else if (availableA1 && out_cand[idxA1]==b2) { idxB2 = idxA1; logtrace(LogMotion,"spatial merging candidate B2: redundant to A1\n"); } else { idxB2 = computed_candidates++; out_cand[idxB2] = b2; logtrace(LogMotion,"spatial merging candidate B2:\n"); logmvcand(out_cand[idxB2]); } } return computed_candidates; } // 8.5.3.1.4 void derive_zero_motion_vector_candidates(const slice_segment_header* shdr, PBMotion* out_mergeCandList, int* inout_numCurrMergeCand, int maxCandidates) { logtrace(LogMotion,"derive_zero_motion_vector_candidates\n"); int numRefIdx; if (shdr->slice_type==SLICE_TYPE_P) { numRefIdx = shdr->num_ref_idx_l0_active; } else { numRefIdx = libde265_min(shdr->num_ref_idx_l0_active, shdr->num_ref_idx_l1_active); } //int numInputMergeCand = *inout_numMergeCand; int zeroIdx = 0; while (*inout_numCurrMergeCand < maxCandidates) { // 1. logtrace(LogMotion,"zeroIdx:%d numRefIdx:%d\n", zeroIdx, numRefIdx); PBMotion* newCand = &out_mergeCandList[*inout_numCurrMergeCand]; const int refIdx = (zeroIdx < numRefIdx) ? 
zeroIdx : 0; if (shdr->slice_type==SLICE_TYPE_P) { newCand->refIdx[0] = refIdx; newCand->refIdx[1] = 0; newCand->predFlag[0] = 1; newCand->predFlag[1] = 0; } else { newCand->refIdx[0] = refIdx; newCand->refIdx[1] = refIdx; newCand->predFlag[0] = 1; newCand->predFlag[1] = 1; } newCand->mv[0].x = 0; newCand->mv[0].y = 0; newCand->mv[1].x = 0; newCand->mv[1].y = 0; (*inout_numCurrMergeCand)++; // 2. zeroIdx++; } } bool scale_mv(MotionVector* out_mv, MotionVector mv, int colDist, int currDist) { int td = Clip3(-128,127, colDist); int tb = Clip3(-128,127, currDist); if (td==0) { *out_mv = mv; return false; } else { int tx = (16384 + (abs_value(td)>>1)) / td; int distScaleFactor = Clip3(-4096,4095, (tb*tx+32)>>6); out_mv->x = Clip3(-32768,32767, Sign(distScaleFactor*mv.x)*((abs_value(distScaleFactor*mv.x)+127)>>8)); out_mv->y = Clip3(-32768,32767, Sign(distScaleFactor*mv.y)*((abs_value(distScaleFactor*mv.y)+127)>>8)); return true; } } // (L1003) 8.5.3.2.8 void derive_collocated_motion_vectors(base_context* ctx, de265_image* img, const slice_segment_header* shdr, int xP,int yP, int colPic, int xColPb,int yColPb, int refIdxLX, // (always 0 for merge mode) int X, MotionVector* out_mvLXCol, uint8_t* out_availableFlagLXCol) { logtrace(LogMotion,"derive_collocated_motion_vectors %d;%d\n",xP,yP); // get collocated image and the prediction mode at the collocated position assert(ctx->has_image(colPic)); const de265_image* colImg = ctx->get_image(colPic); // check for access outside image area if (xColPb >= colImg->get_width() || yColPb >= colImg->get_height()) { ctx->add_warning(DE265_WARNING_COLLOCATED_MOTION_VECTOR_OUTSIDE_IMAGE_AREA, false); *out_availableFlagLXCol = 0; return; } enum PredMode predMode = colImg->get_pred_mode(xColPb,yColPb); // collocated block is Intra -> no collocated MV if (predMode == MODE_INTRA) { out_mvLXCol->x = 0; out_mvLXCol->y = 0; *out_availableFlagLXCol = 0; return; } logtrace(LogMotion,"colPic:%d (POC=%d) X:%d refIdxLX:%d refpiclist:%d\n", colPic, 
colImg->PicOrderCntVal, X,refIdxLX,shdr->RefPicList[X][refIdxLX]); // collocated reference image is unavailable -> no collocated MV if (colImg->integrity == INTEGRITY_UNAVAILABLE_REFERENCE) { out_mvLXCol->x = 0; out_mvLXCol->y = 0; *out_availableFlagLXCol = 0; return; } // get the collocated MV const PBMotion& mvi = colImg->get_mv_info(xColPb,yColPb); int listCol; int refIdxCol; MotionVector mvCol; logtrace(LogMotion,"read MVI %d;%d:\n",xColPb,yColPb); logmvcand(mvi); // collocated MV uses only L1 -> use L1 if (mvi.predFlag[0]==0) { mvCol = mvi.mv[1]; refIdxCol = mvi.refIdx[1]; listCol = 1; } // collocated MV uses only L0 -> use L0 else if (mvi.predFlag[1]==0) { mvCol = mvi.mv[0]; refIdxCol = mvi.refIdx[0]; listCol = 0; } // collocated MV uses L0 and L1 else { bool allRefFramesBeforeCurrentFrame = true; const int currentPOC = img->PicOrderCntVal; // all reference POCs earlier than current POC (list 1) // Test L1 first, because there is a higher change to find a future reference frame. for (int rIdx=0; rIdxnum_ref_idx_l1_active && allRefFramesBeforeCurrentFrame; rIdx++) { const de265_image* refimg = ctx->get_image(shdr->RefPicList[1][rIdx]); int refPOC = refimg->PicOrderCntVal; if (refPOC > currentPOC) { allRefFramesBeforeCurrentFrame = false; } } // all reference POCs earlier than current POC (list 0) for (int rIdx=0; rIdxnum_ref_idx_l0_active && allRefFramesBeforeCurrentFrame; rIdx++) { const de265_image* refimg = ctx->get_image(shdr->RefPicList[0][rIdx]); int refPOC = refimg->PicOrderCntVal; if (refPOC > currentPOC) { allRefFramesBeforeCurrentFrame = false; } } /* TODO: What is the rationale behind this ??? My guess: when there are images before the current frame (most probably in L0) and images after the current frame (most probably in L1), we take the reference in the opposite direction than where the collocated frame is positioned in the hope that the distance to the current frame will be smaller and thus give a better prediction. 
If all references point into the past, we cannot say much about the temporal order or L0,L1 and thus take over both parts. */ if (allRefFramesBeforeCurrentFrame) { mvCol = mvi.mv[X]; refIdxCol = mvi.refIdx[X]; listCol = X; } else { int N = shdr->collocated_from_l0_flag; mvCol = mvi.mv[N]; refIdxCol = mvi.refIdx[N]; listCol = N; } } uint16_t slice_hdr_idx = colImg->get_SliceHeaderIndex(xColPb,yColPb); if (slice_hdr_idx >= colImg->slices.size()) { ctx->add_warning(DE265_WARNING_INVALID_SLICE_HEADER_INDEX_ACCESS, false); *out_availableFlagLXCol = 0; out_mvLXCol->x = 0; out_mvLXCol->y = 0; return; } const slice_segment_header* colShdr = colImg->slices[ colImg->get_SliceHeaderIndex(xColPb,yColPb) ]; if (shdr->LongTermRefPic[X][refIdxLX] != colShdr->LongTermRefPic[listCol][refIdxCol]) { *out_availableFlagLXCol = 0; out_mvLXCol->x = 0; out_mvLXCol->y = 0; } else { *out_availableFlagLXCol = 1; const bool isLongTerm = shdr->LongTermRefPic[X][refIdxLX]; int colDist = colImg->PicOrderCntVal - colShdr->RefPicList_POC[listCol][refIdxCol]; int currDist = img->PicOrderCntVal - shdr->RefPicList_POC[X][refIdxLX]; logtrace(LogMotion,"COLPOCDIFF %d %d [%d %d / %d %d]\n",colDist, currDist, colImg->PicOrderCntVal, colShdr->RefPicList_POC[listCol][refIdxCol], img->PicOrderCntVal, shdr->RefPicList_POC[X][refIdxLX] ); if (isLongTerm || colDist == currDist) { *out_mvLXCol = mvCol; } else { if (!scale_mv(out_mvLXCol, mvCol, colDist, currDist)) { ctx->add_warning(DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false); img->integrity = INTEGRITY_DECODING_ERRORS; } logtrace(LogMotion,"scale: %d;%d to %d;%d\n", mvCol.x,mvCol.y, out_mvLXCol->x,out_mvLXCol->y); } } } // 8.5.3.1.7 void derive_temporal_luma_vector_prediction(base_context* ctx, de265_image* img, const slice_segment_header* shdr, int xP,int yP, int nPbW,int nPbH, int refIdxL, int X, // which MV (L0/L1) to get MotionVector* out_mvLXCol, uint8_t* out_availableFlagLXCol) { // --- no temporal MVP -> exit --- if 
(shdr->slice_temporal_mvp_enabled_flag == 0) { out_mvLXCol->x = 0; out_mvLXCol->y = 0; *out_availableFlagLXCol = 0; return; } // --- find collocated reference image --- int Log2CtbSizeY = img->get_sps().Log2CtbSizeY; int colPic; // TODO: this is the same for the whole slice. We can precompute it. if (shdr->slice_type == SLICE_TYPE_B && shdr->collocated_from_l0_flag == 0) { logtrace(LogMotion,"collocated L1 ref_idx=%d\n",shdr->collocated_ref_idx); colPic = shdr->RefPicList[1][ shdr->collocated_ref_idx ]; } else { logtrace(LogMotion,"collocated L0 ref_idx=%d\n",shdr->collocated_ref_idx); colPic = shdr->RefPicList[0][ shdr->collocated_ref_idx ]; } // check whether collocated reference picture exists if (!ctx->has_image(colPic)) { out_mvLXCol->x = 0; out_mvLXCol->y = 0; *out_availableFlagLXCol = 0; ctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false); return; } // --- get collocated MV either at bottom-right corner or from center of PB --- int xColPb,yColPb; int yColBr = yP + nPbH; // bottom right collocated motion vector position int xColBr = xP + nPbW; /* If neighboring pixel at bottom-right corner is in the same CTB-row and inside the image, use this (reduced down to 16 pixels resolution) as collocated MV position. Note: see 2014, Sze, Sect. 5.2.1.2 why candidate C0 is excluded when on another CTB-row. This is to reduce the memory bandwidth requirements. 
*/ if ((yP>>Log2CtbSizeY) == (yColBr>>Log2CtbSizeY) && xColBr < img->get_sps().pic_width_in_luma_samples && yColBr < img->get_sps().pic_height_in_luma_samples) { xColPb = xColBr & ~0x0F; // reduce resolution of collocated motion-vectors to 16 pixels grid yColPb = yColBr & ~0x0F; derive_collocated_motion_vectors(ctx,img,shdr, xP,yP, colPic, xColPb,yColPb, refIdxL, X, out_mvLXCol, out_availableFlagLXCol); } else { out_mvLXCol->x = 0; out_mvLXCol->y = 0; *out_availableFlagLXCol = 0; } if (*out_availableFlagLXCol==0) { int xColCtr = xP+(nPbW>>1); int yColCtr = yP+(nPbH>>1); xColPb = xColCtr & ~0x0F; // reduce resolution of collocated motion-vectors to 16 pixels grid yColPb = yColCtr & ~0x0F; derive_collocated_motion_vectors(ctx,img,shdr, xP,yP, colPic, xColPb,yColPb, refIdxL, X, out_mvLXCol, out_availableFlagLXCol); } } static int table_8_19[2][12] = { { 0,1,0,2,1,2,0,3,1,3,2,3 }, { 1,0,2,0,2,1,3,0,3,1,3,2 } }; // 8.5.3.1.3 /* Note (TODO): during decoding, we know which of the candidates we will select. + Hence, we do not really have to generate the other ones... 
+ */ void derive_combined_bipredictive_merging_candidates(const base_context* ctx, const slice_segment_header* shdr, PBMotion* inout_mergeCandList, int* inout_numMergeCand, int maxCandidates) { if (*inout_numMergeCand>1 && *inout_numMergeCand < maxCandidates) { int numOrigMergeCand = *inout_numMergeCand; int numInputMergeCand = *inout_numMergeCand; int combIdx = 0; uint8_t combStop = false; while (!combStop) { int l0CandIdx = table_8_19[0][combIdx]; int l1CandIdx = table_8_19[1][combIdx]; if (l0CandIdx >= numInputMergeCand || l1CandIdx >= numInputMergeCand) { assert(false); // bitstream error -> TODO: conceal error } PBMotion& l0Cand = inout_mergeCandList[l0CandIdx]; PBMotion& l1Cand = inout_mergeCandList[l1CandIdx]; logtrace(LogMotion,"add bipredictive merging candidate (combIdx:%d)\n",combIdx); logtrace(LogMotion,"l0Cand:\n"); logmvcand(l0Cand); logtrace(LogMotion,"l1Cand:\n"); logmvcand(l1Cand); const de265_image* img0 = l0Cand.predFlag[0] ? ctx->get_image(shdr->RefPicList[0][l0Cand.refIdx[0]]) : nullptr; const de265_image* img1 = l1Cand.predFlag[1] ? 
ctx->get_image(shdr->RefPicList[1][l1Cand.refIdx[1]]) : nullptr; if (l0Cand.predFlag[0] && !img0) { return; // TODO error } if (l1Cand.predFlag[1] && !img1) { return; // TODO error } if (l0Cand.predFlag[0] && l1Cand.predFlag[1] && (img0->PicOrderCntVal != img1->PicOrderCntVal || l0Cand.mv[0].x != l1Cand.mv[1].x || l0Cand.mv[0].y != l1Cand.mv[1].y)) { PBMotion& p = inout_mergeCandList[ *inout_numMergeCand ]; p.refIdx[0] = l0Cand.refIdx[0]; p.refIdx[1] = l1Cand.refIdx[1]; p.predFlag[0] = l0Cand.predFlag[0]; p.predFlag[1] = l1Cand.predFlag[1]; p.mv[0] = l0Cand.mv[0]; p.mv[1] = l1Cand.mv[1]; (*inout_numMergeCand)++; logtrace(LogMotion,"result:\n"); logmvcand(p); } combIdx++; if (combIdx == numOrigMergeCand*(numOrigMergeCand-1) || *inout_numMergeCand == maxCandidates) { combStop = true; } } } } // 8.5.3.1.1 void get_merge_candidate_list_without_step_9(base_context* ctx, const slice_segment_header* shdr, const MotionVectorAccess& mvaccess, de265_image* img, int xC,int yC, int xP,int yP, int nCS, int nPbW,int nPbH, int partIdx, int max_merge_idx, PBMotion* mergeCandList) { //int xOrigP = xP; //int yOrigP = yP; //int nOrigPbW = nPbW; //int nOrigPbH = nPbH; int singleMCLFlag; // single merge-candidate-list (MCL) flag /* Use single MCL for CBs of size 8x8, except when parallel-merge-level is at 4x4. Without this flag, PBs smaller than 8x8 would not receive as much merging candidates. Having additional candidates might have these advantages: - coding MVs for these small PBs is expensive, and - since the PBs are not far away from a proper (neighboring) merging candidate, the quality of the candidates will still be good. 
*/ singleMCLFlag = (img->get_pps().log2_parallel_merge_level > 2 && nCS==8); if (singleMCLFlag) { xP=xC; yP=yC; nPbW=nCS; nPbH=nCS; partIdx=0; } int maxCandidates = max_merge_idx+1; //MotionVectorSpec mergeCandList[5]; int numMergeCand=0; // --- spatial merge candidates numMergeCand = derive_spatial_merging_candidates(mvaccess, img, xC,yC, nCS, xP,yP, singleMCLFlag, nPbW,nPbH,partIdx, mergeCandList, maxCandidates); // --- collocated merge candidate if (numMergeCand < maxCandidates) { int refIdxCol[2] = { 0,0 }; MotionVector mvCol[2]; uint8_t predFlagLCol[2]; derive_temporal_luma_vector_prediction(ctx,img,shdr, xP,yP,nPbW,nPbH, refIdxCol[0],0, &mvCol[0], &predFlagLCol[0]); uint8_t availableFlagCol = predFlagLCol[0]; predFlagLCol[1] = 0; if (shdr->slice_type == SLICE_TYPE_B) { derive_temporal_luma_vector_prediction(ctx,img,shdr, xP,yP,nPbW,nPbH, refIdxCol[1],1, &mvCol[1], &predFlagLCol[1]); availableFlagCol |= predFlagLCol[1]; } if (availableFlagCol) { PBMotion* colVec = &mergeCandList[numMergeCand++]; colVec->mv[0] = mvCol[0]; colVec->mv[1] = mvCol[1]; colVec->predFlag[0] = predFlagLCol[0]; colVec->predFlag[1] = predFlagLCol[1]; colVec->refIdx[0] = refIdxCol[0]; colVec->refIdx[1] = refIdxCol[1]; } } // --- bipredictive merge candidates --- if (shdr->slice_type == SLICE_TYPE_B) { derive_combined_bipredictive_merging_candidates(ctx, shdr, mergeCandList, &numMergeCand, maxCandidates); } // --- zero-vector merge candidates --- derive_zero_motion_vector_candidates(shdr, mergeCandList, &numMergeCand, maxCandidates); logtrace(LogMotion,"mergeCandList:\n"); for (int i=0;iMaxNumMergeCand;i++) { //logtrace(LogMotion, " %d:%s\n", i, i==merge_idx ? 
" SELECTED":""); logmvcand(mergeCandList[i]); } } void get_merge_candidate_list(base_context* ctx, const slice_segment_header* shdr, de265_image* img, int xC,int yC, int xP,int yP, int nCS, int nPbW,int nPbH, int partIdx, PBMotion* mergeCandList) { int max_merge_idx = 5-shdr->five_minus_max_num_merge_cand -1; get_merge_candidate_list_without_step_9(ctx, shdr, MotionVectorAccess_de265_image(img), img, xC,yC,xP,yP,nCS,nPbW,nPbH, partIdx, max_merge_idx, mergeCandList); // 9. for encoder: modify all merge candidates for (int i=0;i<=max_merge_idx;i++) { if (mergeCandList[i].predFlag[0] && mergeCandList[i].predFlag[1] && nPbW+nPbH==12) { mergeCandList[i].refIdx[1] = 0; mergeCandList[i].predFlag[1] = 0; } } } void derive_luma_motion_merge_mode(base_context* ctx, const slice_segment_header* shdr, de265_image* img, int xC,int yC, int xP,int yP, int nCS, int nPbW,int nPbH, int partIdx, int merge_idx, PBMotion* out_vi) { PBMotion mergeCandList[5]; get_merge_candidate_list_without_step_9(ctx, shdr, MotionVectorAccess_de265_image(img), img, xC,yC,xP,yP,nCS,nPbW,nPbH, partIdx, merge_idx, mergeCandList); *out_vi = mergeCandList[merge_idx]; // 8.5.3.1.1 / 9. if (out_vi->predFlag[0] && out_vi->predFlag[1] && nPbW+nPbH==12) { out_vi->refIdx[1] = 0; out_vi->predFlag[1] = 0; } } // 8.5.3.1.6 void derive_spatial_luma_vector_prediction(base_context* ctx, de265_image* img, const slice_segment_header* shdr, int xC,int yC,int nCS,int xP,int yP, int nPbW,int nPbH, int X, int refIdxLX, int partIdx, uint8_t out_availableFlagLXN[2], MotionVector out_mvLXN[2]) { if (refIdxLX >= MAX_NUM_REF_PICS) { ctx->add_warning(DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false); img->integrity = INTEGRITY_DECODING_ERRORS; out_availableFlagLXN[0] = false; out_availableFlagLXN[1] = false; out_mvLXN[0] = MotionVector(); out_mvLXN[1] = MotionVector(); return; } int isScaledFlagLX = 0; const int A=0; const int B=1; out_availableFlagLXN[A] = 0; out_availableFlagLXN[B] = 0; // --- A --- // 1. 
int xA[2], yA[2]; xA[0] = xP-1; yA[0] = yP + nPbH; xA[1] = xA[0]; yA[1] = yA[0]-1; // 2. out_availableFlagLXN[A] = 0; out_mvLXN[A].x = 0; out_mvLXN[A].y = 0; // 3. / 4. bool availableA[2]; availableA[0] = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA[0],yA[0]); availableA[1] = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA[1],yA[1]); // 5. if (availableA[0] || availableA[1]) { isScaledFlagLX = 1; } // 6. test A0 and A1 (Ak) int refIdxA=-1; // the POC we want to reference in this PB const de265_image* tmpimg = ctx->get_image(shdr->RefPicList[X][ refIdxLX ]); if (tmpimg==nullptr) { return; } const int referenced_POC = tmpimg->PicOrderCntVal; for (int k=0;k<=1;k++) { if (availableA[k] && out_availableFlagLXN[A]==0 && // no A?-predictor so far img->get_pred_mode(xA[k],yA[k]) != MODE_INTRA) { int Y=1-X; const PBMotion& vi = img->get_mv_info(xA[k],yA[k]); logtrace(LogMotion,"MVP A%d=\n",k); logmvcand(vi); const de265_image* imgX = nullptr; if (vi.predFlag[X]) { imgX = ctx->get_image(shdr->RefPicList[X][ vi.refIdx[X] ]); } const de265_image* imgY = nullptr; if (vi.predFlag[Y]) { imgY = ctx->get_image(shdr->RefPicList[Y][ vi.refIdx[Y] ]); } // check whether the predictor X is available and references the same POC if (vi.predFlag[X] && imgX && imgX->PicOrderCntVal == referenced_POC) { logtrace(LogMotion,"take A%d/L%d as A candidate with same POC\n",k,X); out_availableFlagLXN[A]=1; out_mvLXN[A] = vi.mv[X]; refIdxA = vi.refIdx[X]; } // check whether the other predictor (Y) is available and references the same POC else if (vi.predFlag[Y] && imgY && imgY->PicOrderCntVal == referenced_POC) { logtrace(LogMotion,"take A%d/L%d as A candidate with same POC\n",k,Y); out_availableFlagLXN[A]=1; out_mvLXN[A] = vi.mv[Y]; refIdxA = vi.refIdx[Y]; } } } // 7. 
If there is no predictor referencing the same POC, we take any other reference as // long as it is the same type of reference (long-term / short-term) for (int k=0 ; k<=1 && out_availableFlagLXN[A]==0 ; k++) { int refPicList=-1; if (availableA[k] && // TODO: we could remove this call by storing the result of the similar computation above img->get_pred_mode(xA[k],yA[k]) != MODE_INTRA) { int Y=1-X; const PBMotion& vi = img->get_mv_info(xA[k],yA[k]); if (vi.predFlag[X]==1 && shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[X][ vi.refIdx[X] ]) { logtrace(LogMotion,"take A%D/L%d as A candidate with different POCs\n",k,X); out_availableFlagLXN[A]=1; out_mvLXN[A] = vi.mv[X]; refIdxA = vi.refIdx[X]; refPicList = X; } else if (vi.predFlag[Y]==1 && shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[Y][ vi.refIdx[Y] ]) { logtrace(LogMotion,"take A%d/L%d as A candidate with different POCs\n",k,Y); out_availableFlagLXN[A]=1; out_mvLXN[A] = vi.mv[Y]; refIdxA = vi.refIdx[Y]; refPicList = Y; } } if (out_availableFlagLXN[A]==1) { if (refIdxA<0) { out_availableFlagLXN[0] = out_availableFlagLXN[1] = false; return; // error } assert(refIdxA>=0); assert(refPicList>=0); const de265_image* refPicA = ctx->get_image(shdr->RefPicList[refPicList][refIdxA ]); #ifdef DE265_LOG_TRACE const de265_image* refPicX = ctx->get_image(shdr->RefPicList[X][refIdxLX]); #endif //int picStateA = shdr->RefPicList_PicState[refPicList][refIdxA ]; //int picStateX = shdr->RefPicList_PicState[X ][refIdxLX]; int isLongTermA = shdr->LongTermRefPic[refPicList][refIdxA ]; int isLongTermX = shdr->LongTermRefPic[X ][refIdxLX]; logtrace(LogMotion,"scale MVP A: A-POC:%d X-POC:%d\n", refPicA->PicOrderCntVal,refPicX->PicOrderCntVal); if (!isLongTermA && !isLongTermX) /* if (picStateA == UsedForShortTermReference && picStateX == UsedForShortTermReference) */ { int distA = img->PicOrderCntVal - refPicA->PicOrderCntVal; int distX = img->PicOrderCntVal - referenced_POC; if (!scale_mv(&out_mvLXN[A], 
out_mvLXN[A], distA, distX)) { ctx->add_warning(DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false); img->integrity = INTEGRITY_DECODING_ERRORS; } } } } // --- B --- // 1. int xB[3], yB[3]; xB[0] = xP+nPbW; yB[0] = yP-1; xB[1] = xB[0]-1; yB[1] = yP-1; xB[2] = xP-1; yB[2] = yP-1; // 2. out_availableFlagLXN[B] = 0; out_mvLXN[B].x = 0; out_mvLXN[B].y = 0; // 3. test B0,B1,B2 (Bk) int refIdxB=-1; bool availableB[3]; for (int k=0;k<3;k++) { availableB[k] = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB[k],yB[k]); if (availableB[k] && out_availableFlagLXN[B]==0) { int Y=1-X; const PBMotion& vi = img->get_mv_info(xB[k],yB[k]); logtrace(LogMotion,"MVP B%d=\n",k); logmvcand(vi); const de265_image* imgX = nullptr; if (vi.predFlag[X]) { imgX = ctx->get_image(shdr->RefPicList[X][ vi.refIdx[X] ]); } const de265_image* imgY = nullptr; if (vi.predFlag[Y]) { imgY = ctx->get_image(shdr->RefPicList[Y][ vi.refIdx[Y] ]); } if (vi.predFlag[X] && imgX && imgX->PicOrderCntVal == referenced_POC) { logtrace(LogMotion,"a) take B%d/L%d as B candidate with same POC\n",k,X); out_availableFlagLXN[B]=1; out_mvLXN[B] = vi.mv[X]; refIdxB = vi.refIdx[X]; } else if (vi.predFlag[Y] && imgY && imgY->PicOrderCntVal == referenced_POC) { logtrace(LogMotion,"b) take B%d/L%d as B candidate with same POC\n",k,Y); out_availableFlagLXN[B]=1; out_mvLXN[B] = vi.mv[Y]; refIdxB = vi.refIdx[Y]; } } } // 4. if (isScaledFlagLX==0 && // no A predictor, out_availableFlagLXN[B]) // but an unscaled B predictor { // use unscaled B predictor as A predictor logtrace(LogMotion,"copy the same-POC B candidate as additional A candidate\n"); out_availableFlagLXN[A]=1; out_mvLXN[A] = out_mvLXN[B]; refIdxA = refIdxB; } // 5. // If no A predictor, we output the unscaled B as the A predictor (above) // and also add a scaled B predictor here. // If there is (probably) an A predictor, no differing-POC B predictor is generated. 
if (isScaledFlagLX==0) { out_availableFlagLXN[B]=0; for (int k=0 ; k<=2 && out_availableFlagLXN[B]==0 ; k++) { int refPicList=-1; if (availableB[k]) { int Y=1-X; const PBMotion& vi = img->get_mv_info(xB[k],yB[k]); if (vi.predFlag[X]==1 && shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[X][ vi.refIdx[X] ]) { out_availableFlagLXN[B]=1; out_mvLXN[B] = vi.mv[X]; refIdxB = vi.refIdx[X]; refPicList = X; } else if (vi.predFlag[Y]==1 && shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[Y][ vi.refIdx[Y] ]) { out_availableFlagLXN[B]=1; out_mvLXN[B] = vi.mv[Y]; refIdxB = vi.refIdx[Y]; refPicList = Y; } } if (out_availableFlagLXN[B]==1) { if (refIdxB<0) { out_availableFlagLXN[0] = out_availableFlagLXN[1] = false; return; // error } assert(refPicList>=0); assert(refIdxB>=0); const de265_image* refPicB=ctx->get_image(shdr->RefPicList[refPicList][refIdxB ]); const de265_image* refPicX=ctx->get_image(shdr->RefPicList[X ][refIdxLX]); int isLongTermB = shdr->LongTermRefPic[refPicList][refIdxB ]; int isLongTermX = shdr->LongTermRefPic[X ][refIdxLX]; if (refPicB==nullptr || refPicX==nullptr) { img->decctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED,false); img->integrity = INTEGRITY_DECODING_ERRORS; } else if (refPicB->PicOrderCntVal != refPicX->PicOrderCntVal && !isLongTermB && !isLongTermX) { int distB = img->PicOrderCntVal - refPicB->PicOrderCntVal; int distX = img->PicOrderCntVal - referenced_POC; logtrace(LogMotion,"scale MVP B: B-POC:%d X-POC:%d\n",refPicB->PicOrderCntVal,refPicX->PicOrderCntVal); if (!scale_mv(&out_mvLXN[B], out_mvLXN[B], distB, distX)) { ctx->add_warning(DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false); img->integrity = INTEGRITY_DECODING_ERRORS; } } } } } } // 8.5.3.1.5 void fill_luma_motion_vector_predictors(base_context* ctx, const slice_segment_header* shdr, de265_image* img, int xC,int yC,int nCS,int xP,int yP, int nPbW,int nPbH, int l, int refIdx, int partIdx, MotionVector out_mvpList[2]) { // 8.5.3.1.6: 
derive two spatial vector predictors A (0) and B (1) uint8_t availableFlagLXN[2]; MotionVector mvLXN[2]; derive_spatial_luma_vector_prediction(ctx, img, shdr, xC,yC, nCS, xP,yP, nPbW,nPbH, l, refIdx, partIdx, availableFlagLXN, mvLXN); // 8.5.3.1.7: if we only have one spatial vector or both spatial vectors are the same, // derive a temporal predictor uint8_t availableFlagLXCol; MotionVector mvLXCol; if (availableFlagLXN[0] && availableFlagLXN[1] && (mvLXN[0].x != mvLXN[1].x || mvLXN[0].y != mvLXN[1].y)) { availableFlagLXCol = 0; } else { derive_temporal_luma_vector_prediction(ctx, img, shdr, xP,yP, nPbW,nPbH, refIdx,l, &mvLXCol, &availableFlagLXCol); } // --- build candidate vector list with exactly two entries --- int numMVPCandLX=0; // spatial predictor A if (availableFlagLXN[0]) { out_mvpList[numMVPCandLX++] = mvLXN[0]; } // spatial predictor B (if not same as A) if (availableFlagLXN[1] && (!availableFlagLXN[0] || // in case A in not available, but mvLXA initialized to same as mvLXB (mvLXN[0].x != mvLXN[1].x || mvLXN[0].y != mvLXN[1].y))) { out_mvpList[numMVPCandLX++] = mvLXN[1]; } // temporal predictor if (availableFlagLXCol) { out_mvpList[numMVPCandLX++] = mvLXCol; } // fill with zero predictors while (numMVPCandLX<2) { out_mvpList[numMVPCandLX].x = 0; out_mvpList[numMVPCandLX].y = 0; numMVPCandLX++; } assert(numMVPCandLX==2); } MotionVector luma_motion_vector_prediction(base_context* ctx, const slice_segment_header* shdr, de265_image* img, const PBMotionCoding& motion, int xC,int yC,int nCS,int xP,int yP, int nPbW,int nPbH, int l, int refIdx, int partIdx) { MotionVector mvpList[2]; fill_luma_motion_vector_predictors(ctx, shdr, img, xC,yC,nCS,xP,yP, nPbW, nPbH, l, refIdx, partIdx, mvpList); // select predictor according to mvp_lX_flag return mvpList[ l ? 
motion.mvp_l1_flag : motion.mvp_l0_flag ]; } #if DE265_LOG_TRACE void logMV(int x0,int y0,int nPbW,int nPbH, const char* mode,const PBMotion* mv) { int pred0 = mv->predFlag[0]; int pred1 = mv->predFlag[1]; logtrace(LogMotion, "*MV %d;%d [%d;%d] %s: (%d) %d;%d @%d (%d) %d;%d @%d\n", x0,y0,nPbW,nPbH,mode, pred0, pred0 ? mv->mv[0].x : 0,pred0 ? mv->mv[0].y : 0, pred0 ? mv->refIdx[0] : 0, pred1, pred1 ? mv->mv[1].x : 0,pred1 ? mv->mv[1].y : 0, pred1 ? mv->refIdx[1] : 0); } #else #define logMV(x0,y0,nPbW,nPbH,mode,mv) #endif // 8.5.3.1 void motion_vectors_and_ref_indices(base_context* ctx, const slice_segment_header* shdr, de265_image* img, const PBMotionCoding& motion, int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, int partIdx, PBMotion* out_vi) { //slice_segment_header* shdr = tctx->shdr; int xP = xC+xB; int yP = yC+yB; enum PredMode predMode = img->get_pred_mode(xC,yC); if (predMode == MODE_SKIP || (predMode == MODE_INTER && motion.merge_flag)) { derive_luma_motion_merge_mode(ctx,shdr,img, xC,yC, xP,yP, nCS,nPbW,nPbH, partIdx, motion.merge_idx, out_vi); logMV(xP,yP,nPbW,nPbH, "merge_mode", out_vi); } else { int mvdL[2][2]; MotionVector mvpL[2]; for (int l=0;l<2;l++) { // 1. enum InterPredIdc inter_pred_idc = (enum InterPredIdc)motion.inter_pred_idc; if (inter_pred_idc == PRED_BI || (inter_pred_idc == PRED_L0 && l==0) || (inter_pred_idc == PRED_L1 && l==1)) { out_vi->refIdx[l] = motion.refIdx[l]; out_vi->predFlag[l] = 1; } else { out_vi->refIdx[l] = 0; out_vi->predFlag[l] = 0; } // 2. mvdL[l][0] = motion.mvd[l][0]; mvdL[l][1] = motion.mvd[l][1]; if (out_vi->predFlag[l]) { // 3. mvpL[l] = luma_motion_vector_prediction(ctx,shdr,img,motion, xC,yC,nCS,xP,yP, nPbW,nPbH, l, out_vi->refIdx[l], partIdx); // 4. int32_t x = (mvpL[l].x + mvdL[l][0] + 0x10000) & 0xFFFF; int32_t y = (mvpL[l].y + mvdL[l][1] + 0x10000) & 0xFFFF; out_vi->mv[l].x = (x>=0x8000) ? x-0x10000 : x; out_vi->mv[l].y = (y>=0x8000) ? 
y-0x10000 : y; } } logMV(xP,yP,nPbW,nPbH, "mvp", out_vi); } } // 8.5.3 /* xC/yC : CB position xB/yB : position offset of the PB nPbW/nPbH : size of PB nCS : CB size */ void decode_prediction_unit(base_context* ctx, const slice_segment_header* shdr, de265_image* img, const PBMotionCoding& motion, int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, int partIdx) { logtrace(LogMotion,"decode_prediction_unit POC=%d %d;%d %dx%d\n", img->PicOrderCntVal, xC+xB,yC+yB, nPbW,nPbH); //slice_segment_header* shdr = tctx->shdr; // 1. PBMotion vi; motion_vectors_and_ref_indices(ctx, shdr, img, motion, xC,yC, xB,yB, nCS, nPbW,nPbH, partIdx, &vi); // 2. generate_inter_prediction_samples(ctx,shdr, img, xC,yC, xB,yB, nCS, nPbW,nPbH, &vi); img->set_mv_info(xC+xB,yC+yB,nPbW,nPbH, vi); } libde265-1.0.18/libde265/motion.h000066400000000000000000000106431515675107500161270ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #ifndef DE265_MOTION_H #define DE265_MOTION_H #include #include "slice.h" class base_context; class slice_segment_header; class MotionVector { public: int16_t x,y; }; class PBMotion { public: uint8_t predFlag[2]; // which of the two vectors is actually used uint8_t refIdx[2]; // index into RefPicList (valid when predFlag set) MotionVector mv[2]; // the absolute motion vectors bool operator==(const PBMotion&) const; }; class PBMotionCoding { public: // index into RefPicList uint8_t refIdx[2]; // motion vector difference int16_t mvd[2][2]; // [L0/L1][x/y] (only in top left position - ???) // enum InterPredIdc, whether this is prediction from L0,L1, or BI uint8_t inter_pred_idc : 2; // which of the two MVPs is used uint8_t mvp_l0_flag : 1; uint8_t mvp_l1_flag : 1; // whether merge mode is used uint8_t merge_flag : 1; uint8_t merge_idx : 3; }; void get_merge_candidate_list(base_context* ctx, const slice_segment_header* shdr, struct de265_image* img, int xC,int yC, int xP,int yP, int nCS, int nPbW,int nPbH, int partIdx, PBMotion* mergeCandList); /* int derive_spatial_merging_candidates(const struct de265_image* img, int xC, int yC, int nCS, int xP, int yP, uint8_t singleMCLFlag, int nPbW, int nPbH, int partIdx, MotionVectorSpec* out_cand, int maxCandidates); */ void generate_inter_prediction_samples(base_context* ctx, const slice_segment_header* shdr, struct de265_image* img, int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, const PBMotion* vi); /* Fill list (two entries) of motion-vector predictors for MVD coding. 
*/ void fill_luma_motion_vector_predictors(base_context* ctx, const slice_segment_header* shdr, de265_image* img, int xC,int yC,int nCS,int xP,int yP, int nPbW,int nPbH, int l, int refIdx, int partIdx, MotionVector out_mvpList[2]); void decode_prediction_unit(base_context* ctx,const slice_segment_header* shdr, de265_image* img, const PBMotionCoding& motion, int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, int partIdx); class MotionVectorAccess { public: virtual enum PartMode get_PartMode(int x,int y) const = 0; virtual const PBMotion& get_mv_info(int x,int y) const = 0; }; void get_merge_candidate_list_without_step_9(base_context* ctx, const slice_segment_header* shdr, const MotionVectorAccess& mvaccess, de265_image* img, int xC,int yC, int xP,int yP, int nCS, int nPbW,int nPbH, int partIdx, int max_merge_idx, PBMotion* mergeCandList); #endif libde265-1.0.18/libde265/nal-parser.cc000066400000000000000000000217231515675107500170250ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #include "nal-parser.h" #include #include #include #include #ifdef HAVE_CONFIG_H #include "config.h" #endif NAL_unit::NAL_unit() : skipped_bytes(DE265_SKIPPED_BYTES_INITIAL_SIZE) { } NAL_unit::~NAL_unit() { free(nal_data); } void NAL_unit::clear() { header = nal_header(); pts = 0; user_data = nullptr; // set size to zero but keep memory data_size = 0; skipped_bytes.clear(); } LIBDE265_CHECK_RESULT bool NAL_unit::resize(int new_size) { if (capacity < new_size) { unsigned char* newbuffer = static_cast(malloc(new_size)); if (newbuffer == nullptr) { return false; } if (nal_data != nullptr) { memcpy(newbuffer, nal_data, data_size); free(nal_data); } nal_data = newbuffer; capacity = new_size; } return true; } LIBDE265_CHECK_RESULT bool NAL_unit::append(const unsigned char* in_data, int n) { if (!resize(data_size + n)) { return false; } memcpy(nal_data + data_size, in_data, n); data_size += n; return true; } bool LIBDE265_CHECK_RESULT NAL_unit::set_data(const unsigned char* in_data, int n) { if (!resize(n)) { return false; } memcpy(nal_data, in_data, n); data_size = n; return true; } void NAL_unit::insert_skipped_byte(uint32_t pos) { skipped_bytes.push_back(pos); } uint32_t NAL_unit::num_skipped_bytes_before(uint32_t byte_position, uint32_t headerLength) const { if (skipped_bytes.empty()) { return 0; } for (int k=skipped_bytes.size()-1;k>=0;k--) if (skipped_bytes[k] >= headerLength && skipped_bytes[k]-headerLength <= byte_position) { return k+1; } return 0; } void NAL_unit::remove_stuffing_bytes() { uint8_t* p = data(); for (int i=0;i 0) { nal = NAL_free_list.back(); NAL_free_list.pop_back(); } else { nal = new NAL_unit; } nal->clear(); if (!nal->resize(size)) { free_NAL_unit(nal); return nullptr; } return nal; } void NAL_Parser::free_NAL_unit(NAL_unit* nal) { if (nal == nullptr) { // Allow calling with nullptr just like regular "free()" return; } if (NAL_free_list.size() < DE265_NAL_FREE_LIST_SIZE) { NAL_free_list.push_back(nal); } else { delete nal; } } NAL_unit* 
NAL_Parser::pop_from_NAL_queue() { if (NAL_queue.empty()) { return nullptr; } else { NAL_unit* nal = NAL_queue.front(); NAL_queue.pop(); nBytes_in_NAL_queue -= nal->size(); return nal; } } void NAL_Parser::push_to_NAL_queue(NAL_unit* nal) { NAL_queue.push(nal); nBytes_in_NAL_queue += nal->size(); } de265_error NAL_Parser::push_data(const unsigned char* data, int len, de265_PTS pts, void* user_data) { end_of_frame = false; if (pending_input_NAL == nullptr) { pending_input_NAL = alloc_NAL_unit(len+3); if (pending_input_NAL == nullptr) { return DE265_ERROR_OUT_OF_MEMORY; } pending_input_NAL->pts = pts; pending_input_NAL->user_data = user_data; } NAL_unit* nal = pending_input_NAL; // shortcut // Resize output buffer so that complete input would fit. // We add 3, because in the worst case 3 extra bytes are created for an input byte. if (!nal->resize(nal->size() + len + 3)) { return DE265_ERROR_OUT_OF_MEMORY; } unsigned char* out = nal->data() + nal->size(); for (int i=0;iinput_push_state, *data, data, out - ctx->nal_data.data); */ switch (input_push_state) { case 0: case 1: if (*data == 0) { input_push_state++; } else { input_push_state=0; } break; case 2: if (*data == 1) { input_push_state=3; } // nal->clear_skipped_bytes(); } else if (*data == 0) { } // *out++ = 0; } else { input_push_state=0; } break; case 3: *out++ = *data; input_push_state = 4; break; case 4: *out++ = *data; input_push_state = 5; break; case 5: if (*data==0) { input_push_state=6; } else { *out++ = *data; } break; case 6: if (*data==0) { input_push_state=7; } else { *out++ = 0; *out++ = *data; input_push_state=5; } break; case 7: if (*data==0) { *out++ = 0; } else if (*data==3) { *out++ = 0; *out++ = 0; input_push_state=5; // remember which byte we removed nal->insert_skipped_byte((out - nal->data()) + nal->num_skipped_bytes()); } else if (*data==1) { #if DEBUG_INSERT_STREAM_ERRORS if ((rand()%100)<90 && nal_data.size>0) { int pos = rand()%nal_data.size; int bit = rand()%8; nal->nal_data.data[pos] 
^= 1<set_size(out - nal->data());; // push this NAL decoder queue push_to_NAL_queue(nal); // initialize new, empty NAL unit pending_input_NAL = alloc_NAL_unit(len+3); if (pending_input_NAL == nullptr) { return DE265_ERROR_OUT_OF_MEMORY; } pending_input_NAL->pts = pts; pending_input_NAL->user_data = user_data; nal = pending_input_NAL; out = nal->data(); input_push_state=3; //nal->clear_skipped_bytes(); } else { *out++ = 0; *out++ = 0; *out++ = *data; input_push_state=5; } break; } data++; } nal->set_size(out - nal->data()); return DE265_OK; } de265_error NAL_Parser::push_NAL(const unsigned char* data, int len, de265_PTS pts, void* user_data) { // Cannot use byte-stream input and NAL input at the same time. assert(pending_input_NAL == nullptr); end_of_frame = false; NAL_unit* nal = alloc_NAL_unit(len); if (nal == nullptr || !nal->set_data(data, len)) { free_NAL_unit(nal); return DE265_ERROR_OUT_OF_MEMORY; } nal->pts = pts; nal->user_data = user_data; nal->remove_stuffing_bytes(); push_to_NAL_queue(nal); return DE265_OK; } de265_error NAL_Parser::flush_data() { if (pending_input_NAL) { NAL_unit* nal = pending_input_NAL; uint8_t null[2] = { 0,0 }; // append bytes that are implied by the push state if (input_push_state==6) { if (!nal->append(null,1)) { return DE265_ERROR_OUT_OF_MEMORY; } } if (input_push_state==7) { if (!nal->append(null,2)) { return DE265_ERROR_OUT_OF_MEMORY; } } // only push the NAL if it contains at least the NAL header if (input_push_state>=5) { push_to_NAL_queue(nal); pending_input_NAL = nullptr; } input_push_state = 0; } return DE265_OK; } void NAL_Parser::remove_pending_input_data() { // --- remove pending input data --- if (pending_input_NAL) { free_NAL_unit(pending_input_NAL); pending_input_NAL = nullptr; } for (;;) { NAL_unit* nal = pop_from_NAL_queue(); if (nal) { free_NAL_unit(nal); } else break; } input_push_state = 0; nBytes_in_NAL_queue = 0; } 
libde265-1.0.18/libde265/nal-parser.h000066400000000000000000000102001515675107500166530ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef DE265_NAL_PARSER_H #define DE265_NAL_PARSER_H #include "libde265/sps.h" #include "libde265/pps.h" #include "libde265/nal.h" #include "libde265/util.h" #include #include constexpr int DE265_NAL_FREE_LIST_SIZE = 16; constexpr int DE265_SKIPPED_BYTES_INITIAL_SIZE = 16; class NAL_unit { public: NAL_unit(); ~NAL_unit(); nal_header header; de265_PTS pts = 0; void* user_data = nullptr; void clear(); // --- rbsp data --- LIBDE265_CHECK_RESULT bool resize(int new_size); LIBDE265_CHECK_RESULT bool append(const unsigned char* data, int n); LIBDE265_CHECK_RESULT bool set_data(const unsigned char* data, int n); int size() const { return data_size; } void set_size(int s) { data_size=s; } unsigned char* data() { return nal_data; } const unsigned char* data() const { return nal_data; } // --- skipped stuffing bytes --- uint32_t num_skipped_bytes_before(uint32_t byte_position, uint32_t headerLength) const; uint32_t num_skipped_bytes() const { return skipped_bytes.size(); } //void clear_skipped_bytes() { skipped_bytes.clear(); } /* Mark a byte as skipped. It is assumed that the byte is already removed from the input data. 
The NAL data is not modified. */ void insert_skipped_byte(uint32_t pos); /* Remove all stuffing bytes from NAL data. The NAL data is modified and the removed bytes are marked as skipped bytes. */ void remove_stuffing_bytes(); private: unsigned char* nal_data = nullptr; int data_size = 0; int capacity = 0; std::vector skipped_bytes; // up to position[x], there were 'x' skipped bytes }; class NAL_Parser { public: NAL_Parser(); ~NAL_Parser(); de265_error push_data(const unsigned char* data, int len, de265_PTS pts, void* user_data = nullptr); de265_error push_NAL(const unsigned char* data, int len, de265_PTS pts, void* user_data = nullptr); NAL_unit* pop_from_NAL_queue(); de265_error flush_data(); void mark_end_of_stream() { end_of_stream=true; } void mark_end_of_frame() { end_of_frame=true; } void remove_pending_input_data(); int bytes_in_input_queue() const { int size = nBytes_in_NAL_queue; if (pending_input_NAL) { size += pending_input_NAL->size(); } return size; } int number_of_NAL_units_pending() const { int size = NAL_queue.size(); if (pending_input_NAL) { size++; } return size; } int number_of_complete_NAL_units_pending() const { return NAL_queue.size(); } void free_NAL_unit(NAL_unit*); int get_NAL_queue_length() const { return NAL_queue.size(); } bool is_end_of_stream() const { return end_of_stream; } bool is_end_of_frame() const { return end_of_frame; } private: // byte-stream level bool end_of_stream = false; // data in pending_input_data is end of stream bool end_of_frame = false; // data in pending_input_data is end of frame int input_push_state = 0; NAL_unit* pending_input_NAL = nullptr; // NAL level std::queue NAL_queue; // enqueued NALs have suffing bytes removed int nBytes_in_NAL_queue = 0; // data bytes currently in NAL_queue void push_to_NAL_queue(NAL_unit*); // pool of unused NAL memory std::vector NAL_free_list; // maximum size: DE265_NAL_FREE_LIST_SIZE LIBDE265_CHECK_RESULT NAL_unit* alloc_NAL_unit(int size); }; #endif 
libde265-1.0.18/libde265/nal.cc000066400000000000000000000074301515675107500155320ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #include "nal.h" #include "cabac.h" #include de265_error nal_header::read(bitreader* reader) { reader->skip_bits(1); nal_unit_type = reader->get_bits(6); nuh_layer_id = reader->get_bits(6); uint32_t nuh_temporal_id_plus1 = reader->get_bits(3); if (nuh_temporal_id_plus1 == 0) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } nuh_temporal_id = nuh_temporal_id_plus1 - 1; return DE265_OK; } void nal_header::write(CABAC_encoder& out) const { out.skip_bits(1); out.write_bits(nal_unit_type,6); out.write_bits(nuh_layer_id ,6); out.write_bits(nuh_temporal_id+1,3); } bool isIDR(uint8_t unit_type) { return (unit_type == NAL_UNIT_IDR_W_RADL || unit_type == NAL_UNIT_IDR_N_LP); } bool isBLA(uint8_t unit_type) { return (unit_type == NAL_UNIT_BLA_W_LP || unit_type == NAL_UNIT_BLA_W_RADL || unit_type == NAL_UNIT_BLA_N_LP); } bool isCRA(uint8_t unit_type) { return unit_type == NAL_UNIT_CRA_NUT; } bool isRAP(uint8_t unit_type) { return isIDR(unit_type) || isBLA(unit_type) || isCRA(unit_type); } bool isRASL(uint8_t unit_type) { return (unit_type == NAL_UNIT_RASL_N || unit_type == NAL_UNIT_RASL_R); } bool isIRAP(uint8_t unit_type) { return (unit_type >= 
NAL_UNIT_BLA_W_LP && unit_type <= NAL_UNIT_RESERVED_IRAP_VCL23); } bool isRADL(uint8_t unit_type) { return (unit_type == NAL_UNIT_RADL_N || unit_type == NAL_UNIT_RADL_R); } bool isReferenceNALU(uint8_t unit_type) { return ( ((unit_type <= NAL_UNIT_RESERVED_VCL_R15) && (unit_type%2 != 0)) || ((unit_type >= NAL_UNIT_BLA_W_LP) && (unit_type <= NAL_UNIT_RESERVED_IRAP_VCL23)) ); } bool isSublayerNonReference(uint8_t unit_type) { switch (unit_type) { case NAL_UNIT_TRAIL_N: case NAL_UNIT_TSA_N: case NAL_UNIT_STSA_N: case NAL_UNIT_RADL_N: case NAL_UNIT_RASL_N: case NAL_UNIT_RESERVED_VCL_N10: case NAL_UNIT_RESERVED_VCL_N12: case NAL_UNIT_RESERVED_VCL_N14: return true; default: return false; } } static const char* NAL_unit_name[] = { "TRAIL_N", // 0 "TRAIL_R", "TSA_N", "TSA_R", "STSA_N", "STSA_R", // 5 "RADL_N", "RADL_R", "RASL_N", "RASL_R", "RESERVED_VCL_N10", // 10 "RESERVED_VCL_R11", "RESERVED_VCL_N12", "RESERVED_VCL_R13", "RESERVED_VCL_N14", "RESERVED_VCL_R15", // 15 "BLA_W_LP", "BLA_W_RADL", "BLA_N_LP", "IDR_W_RADL", "IDR_N_LP", // 20 "CRA_NUT", "RESERVED_IRAP_VCL22", "RESERVED_IRAP_VCL23", "RESERVED_VCL24", "RESERVED_VCL25", // 25 "RESERVED_VCL26", "RESERVED_VCL27", "RESERVED_VCL28", "RESERVED_VCL29", "RESERVED_VCL30", // 30 "RESERVED_VCL31", "VPS", "SPS", "PPS", "AUD", // 35 "EOS", "EOB", "FD", "PREFIX_SEI", "SUFFIX_SEI", // 40 "RESERVED_NVCL41", "RESERVED_NVCL42", "RESERVED_NVCL43", "RESERVED_NVCL44", "RESERVED_NVCL45", // 45 "RESERVED_NVCL46", "RESERVED_NVCL47" }; const char* get_NAL_name(uint8_t unit_type) { if (unit_type >= 48) { return "INVALID NAL >= 48"; } return NAL_unit_name[unit_type]; } libde265-1.0.18/libde265/nal.h000066400000000000000000000073531515675107500154000ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. 
* * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef DE265_NAL_H #define DE265_NAL_H #ifdef HAVE_CONFIG_H #include #endif #include #include "libde265/bitstream.h" #include "libde265/cabac.h" #include "libde265/de265.h" struct nal_header { nal_header() { nal_unit_type = 0; nuh_layer_id = 0; nuh_temporal_id = 0; } de265_error read(bitreader* reader); void write(CABAC_encoder& writer) const; void set(int unit_type, int layer_id=0, int temporal_id=0) { nal_unit_type =unit_type; nuh_layer_id =layer_id; nuh_temporal_id=temporal_id; } uint8_t nal_unit_type; uint8_t nuh_layer_id; uint8_t nuh_temporal_id; }; #define NAL_UNIT_TRAIL_N 0 #define NAL_UNIT_TRAIL_R 1 #define NAL_UNIT_TSA_N 2 #define NAL_UNIT_TSA_R 3 #define NAL_UNIT_STSA_N 4 #define NAL_UNIT_STSA_R 5 #define NAL_UNIT_RADL_N 6 #define NAL_UNIT_RADL_R 7 #define NAL_UNIT_RASL_N 8 #define NAL_UNIT_RASL_R 9 #define NAL_UNIT_RESERVED_VCL_N10 10 #define NAL_UNIT_RESERVED_VCL_N12 12 #define NAL_UNIT_RESERVED_VCL_N14 14 #define NAL_UNIT_RESERVED_VCL_R11 11 #define NAL_UNIT_RESERVED_VCL_R13 13 #define NAL_UNIT_RESERVED_VCL_R15 15 #define NAL_UNIT_BLA_W_LP 16 // BLA = broken link access #define NAL_UNIT_BLA_W_RADL 17 #define NAL_UNIT_BLA_N_LP 18 #define NAL_UNIT_IDR_W_RADL 19 #define NAL_UNIT_IDR_N_LP 20 #define NAL_UNIT_CRA_NUT 21 // CRA = clean random access #define NAL_UNIT_RESERVED_IRAP_VCL22 22 #define NAL_UNIT_RESERVED_IRAP_VCL23 23 #define 
NAL_UNIT_RESERVED_VCL24 24 #define NAL_UNIT_RESERVED_VCL25 25 #define NAL_UNIT_RESERVED_VCL26 26 #define NAL_UNIT_RESERVED_VCL27 27 #define NAL_UNIT_RESERVED_VCL28 28 #define NAL_UNIT_RESERVED_VCL29 29 #define NAL_UNIT_RESERVED_VCL30 30 #define NAL_UNIT_RESERVED_VCL31 31 #define NAL_UNIT_VPS_NUT 32 #define NAL_UNIT_SPS_NUT 33 #define NAL_UNIT_PPS_NUT 34 #define NAL_UNIT_AUD_NUT 35 #define NAL_UNIT_EOS_NUT 36 #define NAL_UNIT_EOB_NUT 37 #define NAL_UNIT_FD_NUT 38 #define NAL_UNIT_PREFIX_SEI_NUT 39 #define NAL_UNIT_SUFFIX_SEI_NUT 40 #define NAL_UNIT_RESERVED_NVCL41 41 #define NAL_UNIT_RESERVED_NVCL42 42 #define NAL_UNIT_RESERVED_NVCL43 43 #define NAL_UNIT_RESERVED_NVCL44 44 #define NAL_UNIT_RESERVED_NVCL45 45 #define NAL_UNIT_RESERVED_NVCL46 46 #define NAL_UNIT_RESERVED_NVCL47 47 #define NAL_UNIT_UNDEFINED 255 bool isIDR(uint8_t unit_type); bool isBLA(uint8_t unit_type); bool isCRA(uint8_t unit_type); bool isRAP(uint8_t unit_type); bool isRASL(uint8_t unit_type); bool isIRAP(uint8_t unit_type); bool isRADL(uint8_t unit_type); bool isReferenceNALU(uint8_t unit_type); bool isSublayerNonReference(uint8_t unit_type); const char* get_NAL_name(uint8_t unit_type); inline bool isIdrPic(uint8_t nal_unit_type) { return (nal_unit_type == NAL_UNIT_IDR_W_RADL || nal_unit_type == NAL_UNIT_IDR_N_LP); } inline bool isRapPic(uint8_t nal_unit_type) { return nal_unit_type >= 16 && nal_unit_type <= 23; } #endif libde265-1.0.18/libde265/pps.cc000066400000000000000000000663421515675107500155710ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. 
* * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #include "pps.h" #include "decctx.h" #include "util.h" #include #include #include #if defined(_MSC_VER) || defined(__MINGW32__) # include #elif defined(HAVE_ALLOCA_H) # include #endif void pps_range_extension::reset() { log2_max_transform_skip_block_size = 2; cross_component_prediction_enabled_flag = false; chroma_qp_offset_list_enabled_flag = false; diff_cu_chroma_qp_offset_depth = 0; chroma_qp_offset_list_len = 0; log2_sao_offset_scale_luma = 0; log2_sao_offset_scale_chroma = 0; } bool pps_range_extension::read(bitreader* br, decoder_context* ctx, const pic_parameter_set* pps) { const seq_parameter_set* sps = ctx->get_sps(pps->seq_parameter_set_id); uint32_t uvlc; if (pps->transform_skip_enabled_flag) { uvlc = br->get_uvlc(); if (uvlc == UVLC_ERROR || uvlc > static_cast(sps->Log2MaxTrafoSize) - 2) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } log2_max_transform_skip_block_size = uvlc+2; } cross_component_prediction_enabled_flag = br->get_bits(1); if (sps->ChromaArrayType != CHROMA_444 && cross_component_prediction_enabled_flag) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); } chroma_qp_offset_list_enabled_flag = br->get_bits(1); if (sps->ChromaArrayType == CHROMA_MONO && chroma_qp_offset_list_enabled_flag) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); } if (chroma_qp_offset_list_enabled_flag) { uvlc = br->get_uvlc(); if (uvlc == UVLC_ERROR || uvlc > static_cast(sps->log2_diff_max_min_luma_coding_block_size)) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } diff_cu_chroma_qp_offset_depth = uvlc; uvlc = br->get_uvlc(); if 
(uvlc == UVLC_ERROR || uvlc > 5) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } chroma_qp_offset_list_len = uvlc+1; for (int i=0;iget_svlc(); if (svlc == SVLC_ERROR || svlc < -12 || svlc > 12) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } cb_qp_offset_list[i] = svlc; svlc = br->get_svlc(); if (svlc == SVLC_ERROR || svlc < -12 || svlc > 12) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } cr_qp_offset_list[i] = svlc; } } uvlc = br->get_uvlc(); if (uvlc == UVLC_ERROR || uvlc > static_cast(libde265_max(0, sps->BitDepth_Y-10))) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } log2_sao_offset_scale_luma = uvlc; uvlc = br->get_uvlc(); if (uvlc == UVLC_ERROR || uvlc > static_cast(libde265_max(0, sps->BitDepth_C-10))) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } log2_sao_offset_scale_chroma = uvlc; return true; } void pps_range_extension::dump(int fd) const { FILE* fh; if (fd==1) fh=stdout; else if (fd==2) fh=stderr; else { return; } #define LOG0(t) log2fh(fh, t) #define LOG1(t,d) log2fh(fh, t,d) #define LOG2(t,d,e) log2fh(fh, t,d,e) LOG0("---------- PPS range-extension ----------\n"); LOG1("log2_max_transform_skip_block_size : %d\n", log2_max_transform_skip_block_size); LOG1("cross_component_prediction_enabled_flag : %d\n", cross_component_prediction_enabled_flag); LOG1("chroma_qp_offset_list_enabled_flag : %d\n", chroma_qp_offset_list_enabled_flag); if (chroma_qp_offset_list_enabled_flag) { LOG1("diff_cu_chroma_qp_offset_depth : %d\n", diff_cu_chroma_qp_offset_depth); LOG1("chroma_qp_offset_list_len : %d\n", chroma_qp_offset_list_len); for (int i=0;iget_uvlc(); if (uvlc == UVLC_ERROR || uvlc >= DE265_MAX_PPS_SETS) { ctx->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false); return false; } pic_parameter_set_id = uvlc; uvlc = br->get_uvlc(); if (uvlc == UVLC_ERROR || uvlc >= DE265_MAX_SPS_SETS) { 
ctx->add_warning(DE265_WARNING_NONEXISTING_SPS_REFERENCED, false); return false; } seq_parameter_set_id = uvlc; dependent_slice_segments_enabled_flag = br->get_bits(1); output_flag_present_flag = br->get_bits(1); num_extra_slice_header_bits = br->get_bits(3); sign_data_hiding_flag = br->get_bits(1); cabac_init_present_flag = br->get_bits(1); uvlc = br->get_uvlc(); if (uvlc == UVLC_ERROR || uvlc > 15) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } num_ref_idx_l0_default_active = uvlc + 1; uvlc = br->get_uvlc(); if (uvlc == UVLC_ERROR || uvlc > 15) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } num_ref_idx_l1_default_active = uvlc + 1; if (!ctx->has_sps(seq_parameter_set_id)) { ctx->add_warning(DE265_WARNING_NONEXISTING_SPS_REFERENCED, false); return false; } sps = ctx->get_shared_sps(seq_parameter_set_id); { int32_t svlc; if ((svlc = br->get_svlc()) == SVLC_ERROR) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } pic_init_qp = svlc + 26; } constrained_intra_pred_flag = br->get_bits(1); transform_skip_enabled_flag = br->get_bits(1); cu_qp_delta_enabled_flag = br->get_bits(1); if (cu_qp_delta_enabled_flag) { // diff_cu_qp_delta_depth shall be in [0, log2_diff_max_min_luma_coding_block_size] (Sec. 
7.4.3.3.1) if ((uvlc = br->get_uvlc()) == UVLC_ERROR || uvlc > sps->log2_diff_max_min_luma_coding_block_size) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } diff_cu_qp_delta_depth = uvlc; } else { diff_cu_qp_delta_depth = 0; } { int32_t svlc; if ((svlc = br->get_svlc()) == SVLC_ERROR) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } pic_cb_qp_offset = svlc; } { int32_t svlc; if ((svlc = br->get_svlc()) == SVLC_ERROR) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } pic_cr_qp_offset = svlc; } pps_slice_chroma_qp_offsets_present_flag = br->get_bits(1); weighted_pred_flag = br->get_bits(1); weighted_bipred_flag = br->get_bits(1); transquant_bypass_enable_flag = br->get_bits(1); tiles_enabled_flag = br->get_bits(1); entropy_coding_sync_enabled_flag = br->get_bits(1); // --- tiles --- if (tiles_enabled_flag) { if ((uvlc = br->get_uvlc()) == UVLC_ERROR || uvlc+1 > DE265_MAX_TILE_COLUMNS) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } num_tile_columns = uvlc+1; if ((uvlc = br->get_uvlc()) == UVLC_ERROR || uvlc+1 > DE265_MAX_TILE_ROWS) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } num_tile_rows = uvlc+1; uniform_spacing_flag = br->get_bits(1); if (uniform_spacing_flag==false) { uint16_t lastColumnWidth = sps->PicWidthInCtbsY; uint16_t lastRowHeight = sps->PicHeightInCtbsY; for (int i=0; iget_uvlc()) == UVLC_ERROR || uvlc >= lastColumnWidth) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } colWidth[i] = uvlc+1; lastColumnWidth -= colWidth[i]; } colWidth[num_tile_columns-1] = lastColumnWidth; for (int i=0; iget_uvlc()) == UVLC_ERROR || uvlc >= lastRowHeight) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } rowHeight[i] = uvlc+1; lastRowHeight -= rowHeight[i]; } rowHeight[num_tile_rows-1] = lastRowHeight; } loop_filter_across_tiles_enabled_flag = br->get_bits(1); } else { 
num_tile_columns = 1; num_tile_rows = 1; uniform_spacing_flag = 1; loop_filter_across_tiles_enabled_flag = 0; } // END tiles beta_offset = 0; // default value tc_offset = 0; // default value pps_loop_filter_across_slices_enabled_flag = br->get_bits(1); deblocking_filter_control_present_flag = br->get_bits(1); if (deblocking_filter_control_present_flag) { deblocking_filter_override_enabled_flag = br->get_bits(1); pic_disable_deblocking_filter_flag = br->get_bits(1); if (!pic_disable_deblocking_filter_flag) { { int32_t svlc; // pps_beta_offset_div2 shall be in [-6, 6] (Sec. 7.4.3.3.1) if ((svlc = br->get_svlc()) == SVLC_ERROR || svlc < -6 || svlc > 6) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } beta_offset = svlc * 2; // pps_tc_offset_div2 shall be in [-6, 6] (Sec. 7.4.3.3.1) if ((svlc = br->get_svlc()) == SVLC_ERROR || svlc < -6 || svlc > 6) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } tc_offset = svlc * 2; } } } else { deblocking_filter_override_enabled_flag = 0; pic_disable_deblocking_filter_flag = 0; } // --- scaling list --- pic_scaling_list_data_present_flag = br->get_bits(1); // check consistency: if scaling-lists are not enabled, pic_scalign_list_data_present_flag // must be FALSE if (sps->scaling_list_enable_flag==0 && pic_scaling_list_data_present_flag != 0) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } if (pic_scaling_list_data_present_flag) { de265_error err = read_scaling_list(br, sps.get(), &scaling_list, true); if (err != DE265_OK) { ctx->add_warning(err, false); return false; } } else { memcpy(&scaling_list, &sps->scaling_list, sizeof(scaling_list_data)); } lists_modification_present_flag = br->get_bits(1); if ((uvlc = br->get_uvlc()) == UVLC_ERROR || uvlc > 4) { ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } log2_parallel_merge_level = uvlc + 2; if (log2_parallel_merge_level-2 > sps->log2_min_luma_coding_block_size-3 +1 + 
sps->log2_diff_max_min_luma_coding_block_size) { return false; } slice_segment_header_extension_present_flag = br->get_bits(1); pps_extension_flag = br->get_bits(1); if (pps_extension_flag) { pps_range_extension_flag = br->get_bits(1); pps_multilayer_extension_flag = br->get_bits(1); pps_extension_6bits = br->get_bits(6); if (pps_range_extension_flag) { bool success = range_extension.read(br, ctx, this); if (!success) { return false; } } //assert(false); /* while( more_rbsp_data() ) pps_extension_data_flag u(1) rbsp_trailing_bits() } */ } set_derived_values(sps.get()); pps_read = true; return true; } void pic_parameter_set::set_derived_values(const seq_parameter_set* sps) { Log2MinCuQpDeltaSize = sps->Log2CtbSizeY - diff_cu_qp_delta_depth; Log2MinCuChromaQpOffsetSize = sps->Log2CtbSizeY - range_extension.diff_cu_chroma_qp_offset_depth; Log2MaxTransformSkipSize = range_extension.log2_max_transform_skip_block_size; if (uniform_spacing_flag) { // set columns widths int *const colPos = static_cast(alloca((num_tile_columns+1) * sizeof(int))); for (int i=0;i<=num_tile_columns;i++) { colPos[i] = i*sps->PicWidthInCtbsY / num_tile_columns; } for (int i=0;i(alloca((num_tile_rows+1) * sizeof(int))); for (int i=0;i<=num_tile_rows;i++) { rowPos[i] = i*sps->PicHeightInCtbsY / num_tile_rows; } for (int i=0;iPicSizeInCtbsY); CtbAddrTStoRS.resize(sps->PicSizeInCtbsY); TileId .resize(sps->PicSizeInCtbsY); TileIdRS .resize(sps->PicSizeInCtbsY); MinTbAddrZS .resize(sps->PicSizeInTbsY ); // raster scan (RS) <-> tile scan (TS) conversion for (uint32_t ctbAddrRS=0 ; ctbAddrRS < sps->PicSizeInCtbsY ; ctbAddrRS++) { int tbX = ctbAddrRS % sps->PicWidthInCtbsY; int tbY = ctbAddrRS / sps->PicWidthInCtbsY; int tileX=-1,tileY=-1; for (int i=0;i= colBd[i]) tileX=i; for (int j=0;j= rowBd[j]) tileY=j; CtbAddrRStoTS[ctbAddrRS] = 0; for (int i=0;iCtbAddrRStoTS[ctbAddrRS] += (tbY - pps->rowBd[tileY])*pps->colWidth[tileX]; //pps->CtbAddrRStoTS[ctbAddrRS] += tbX - pps->colBd[tileX]; 
CtbAddrRStoTS[ctbAddrRS] += sps->PicWidthInCtbsY * rowHeight[j]; } assert(tileX>=0 && tileY>=0); CtbAddrRStoTS[ctbAddrRS] += (tbY-rowBd[tileY])*colWidth[tileX]; CtbAddrRStoTS[ctbAddrRS] += tbX - colBd[tileX]; // inverse mapping CtbAddrTStoRS[ CtbAddrRStoTS[ctbAddrRS] ] = ctbAddrRS; } #if 0 logtrace(LogHeaders,"6.5.1 CtbAddrRSToTS\n"); for (int y=0;yPicHeightInCtbsY;y++) { for (int x=0;xPicWidthInCtbsY;x++) { logtrace(LogHeaders,"%3d ", CtbAddrRStoTS[x + y*sps->PicWidthInCtbsY]); } logtrace(LogHeaders,"\n"); } #endif // tile id for (int j=0, tIdx=0 ; jPicWidthInCtbsY + x] ] = tIdx; TileIdRS[ y*sps->PicWidthInCtbsY + x ] = tIdx; //logtrace(LogHeaders,"tileID[%d,%d] = %d\n",x,y,pps->TileIdRS[ y*sps->PicWidthInCtbsY + x ]); } tIdx++; } #if 0 logtrace(LogHeaders,"Tile IDs RS:\n"); for (int y=0;yPicHeightInCtbsY;y++) { for (int x=0;xPicWidthInCtbsY;x++) { logtrace(LogHeaders,"%2d ",TileIdRS[y*sps->PicWidthInCtbsY+x]); } logtrace(LogHeaders,"\n"); } #endif // 6.5.2 Z-scan order array initialization process for (int y=0;yPicHeightInTbsY;y++) for (int x=0;xPicWidthInTbsY;x++) { int tbX = (x<Log2MinTrafoSize)>>sps->Log2CtbSizeY; int tbY = (y<Log2MinTrafoSize)>>sps->Log2CtbSizeY; int ctbAddrRS = sps->PicWidthInCtbsY*tbY + tbX; MinTbAddrZS[x + y*sps->PicWidthInTbsY] = CtbAddrRStoTS[ctbAddrRS] << ((sps->Log2CtbSizeY-sps->Log2MinTrafoSize)*2); int p=0; for (int i=0 ; i<(sps->Log2CtbSizeY - sps->Log2MinTrafoSize) ; i++) { int m=1<PicWidthInTbsY] += p; } // --- debug logging --- /* logtrace(LogHeaders,"6.5.2 Z-scan order array\n"); for (int y=0;yPicHeightInTbsY;y++) { for (int x=0;xPicWidthInTbsY;x++) { logtrace(LogHeaders,"%4d ", pps->MinTbAddrZS[x + y*sps->PicWidthInTbsY]); } logtrace(LogHeaders,"\n"); } for (int i=0;iPicSizeInTbsY;i++) { for (int y=0;yPicHeightInTbsY;y++) { for (int x=0;xPicWidthInTbsY;x++) { if (pps->MinTbAddrZS[x + y*sps->PicWidthInTbsY] == i) { logtrace(LogHeaders,"%d %d\n",x,y); } } } } */ } bool pic_parameter_set::write(error_queue* errqueue, 
CABAC_encoder& out, const seq_parameter_set* sps) { if (pic_parameter_set_id >= DE265_MAX_PPS_SETS) { errqueue->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false); return false; } out.write_uvlc(pic_parameter_set_id); if (seq_parameter_set_id >= DE265_MAX_PPS_SETS) { errqueue->add_warning(DE265_WARNING_NONEXISTING_SPS_REFERENCED, false); return false; } out.write_uvlc(seq_parameter_set_id); out.write_bit(dependent_slice_segments_enabled_flag); out.write_bit(output_flag_present_flag); out.write_bits(num_extra_slice_header_bits,3); out.write_bit(sign_data_hiding_flag); out.write_bit(cabac_init_present_flag); out.write_uvlc(num_ref_idx_l0_default_active-1); out.write_uvlc(num_ref_idx_l1_default_active-1); out.write_svlc(pic_init_qp-26); out.write_bit(constrained_intra_pred_flag); out.write_bit(transform_skip_enabled_flag); out.write_bit(cu_qp_delta_enabled_flag); if (cu_qp_delta_enabled_flag) { out.write_uvlc(diff_cu_qp_delta_depth); } out.write_svlc(pic_cb_qp_offset); out.write_svlc(pic_cr_qp_offset); out.write_bit(pps_slice_chroma_qp_offsets_present_flag); out.write_bit(weighted_pred_flag); out.write_bit(weighted_bipred_flag); out.write_bit(transquant_bypass_enable_flag); out.write_bit(tiles_enabled_flag); out.write_bit(entropy_coding_sync_enabled_flag); // --- tiles --- if (tiles_enabled_flag) { if (num_tile_columns > DE265_MAX_TILE_COLUMNS) { errqueue->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } out.write_uvlc(num_tile_columns-1); if (num_tile_rows > DE265_MAX_TILE_ROWS) { errqueue->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } out.write_uvlc(num_tile_rows-1); out.write_bit(uniform_spacing_flag); if (uniform_spacing_flag==false) { for (int i=0; iscaling_list_enable_flag==0 && pic_scaling_list_data_present_flag != 0) { errqueue->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); return false; } if (pic_scaling_list_data_present_flag) { de265_error err = write_scaling_list(out, sps, &scaling_list, true); 
if (err != DE265_OK) { errqueue->add_warning(err, false); return false; } } out.write_bit(lists_modification_present_flag); out.write_uvlc(log2_parallel_merge_level-2); out.write_bit(slice_segment_header_extension_present_flag); out.write_bit(pps_extension_flag); if (pps_extension_flag) { //assert(false); /* while( more_rbsp_data() ) pps_extension_data_flag u(1) rbsp_trailing_bits() } */ } pps_read = true; return true; } void pic_parameter_set::dump(int fd) const { FILE* fh; if (fd==1) fh=stdout; else if (fd==2) fh=stderr; else { return; } #define LOG0(t) log2fh(fh, t) #define LOG1(t,d) log2fh(fh, t,d) LOG0("----------------- PPS -----------------\n"); LOG1("pic_parameter_set_id : %d\n", pic_parameter_set_id); LOG1("seq_parameter_set_id : %d\n", seq_parameter_set_id); LOG1("dependent_slice_segments_enabled_flag : %d\n", dependent_slice_segments_enabled_flag); LOG1("sign_data_hiding_flag : %d\n", sign_data_hiding_flag); LOG1("cabac_init_present_flag : %d\n", cabac_init_present_flag); LOG1("num_ref_idx_l0_default_active : %d\n", num_ref_idx_l0_default_active); LOG1("num_ref_idx_l1_default_active : %d\n", num_ref_idx_l1_default_active); LOG1("pic_init_qp : %d\n", pic_init_qp); LOG1("constrained_intra_pred_flag: %d\n", constrained_intra_pred_flag); LOG1("transform_skip_enabled_flag: %d\n", transform_skip_enabled_flag); LOG1("cu_qp_delta_enabled_flag : %d\n", cu_qp_delta_enabled_flag); if (cu_qp_delta_enabled_flag) { LOG1("diff_cu_qp_delta_depth : %d\n", diff_cu_qp_delta_depth); } LOG1("pic_cb_qp_offset : %d\n", pic_cb_qp_offset); LOG1("pic_cr_qp_offset : %d\n", pic_cr_qp_offset); LOG1("pps_slice_chroma_qp_offsets_present_flag : %d\n", pps_slice_chroma_qp_offsets_present_flag); LOG1("weighted_pred_flag : %d\n", weighted_pred_flag); LOG1("weighted_bipred_flag : %d\n", weighted_bipred_flag); LOG1("output_flag_present_flag : %d\n", output_flag_present_flag); LOG1("transquant_bypass_enable_flag: %d\n", transquant_bypass_enable_flag); LOG1("tiles_enabled_flag : %d\n", 
tiles_enabled_flag); LOG1("entropy_coding_sync_enabled_flag: %d\n", entropy_coding_sync_enabled_flag); if (tiles_enabled_flag) { LOG1("num_tile_columns : %d\n", num_tile_columns); LOG1("num_tile_rows : %d\n", num_tile_rows); LOG1("uniform_spacing_flag: %d\n", uniform_spacing_flag); LOG0("tile column boundaries: "); for (int i=0;i<=num_tile_columns;i++) { LOG1("*%d ",colBd[i]); } LOG0("*\n"); LOG0("tile row boundaries: "); for (int i=0;i<=num_tile_rows;i++) { LOG1("*%d ",rowBd[i]); } LOG0("*\n"); //if( !uniform_spacing_flag ) { /* for( i = 0; i < num_tile_columns_minus1; i++ ) column_width_minus1[i] ue(v) for( i = 0; i < num_tile_rows_minus1; i++ ) row_height_minus1[i] ue(v) } */ LOG1("loop_filter_across_tiles_enabled_flag : %d\n", loop_filter_across_tiles_enabled_flag); } LOG1("pps_loop_filter_across_slices_enabled_flag: %d\n", pps_loop_filter_across_slices_enabled_flag); LOG1("deblocking_filter_control_present_flag: %d\n", deblocking_filter_control_present_flag); if (deblocking_filter_control_present_flag) { LOG1("deblocking_filter_override_enabled_flag: %d\n", deblocking_filter_override_enabled_flag); LOG1("pic_disable_deblocking_filter_flag: %d\n", pic_disable_deblocking_filter_flag); LOG1("beta_offset: %d\n", beta_offset); LOG1("tc_offset: %d\n", tc_offset); } LOG1("pic_scaling_list_data_present_flag: %d\n", pic_scaling_list_data_present_flag); if (pic_scaling_list_data_present_flag) { //scaling_list_data() } LOG1("lists_modification_present_flag: %d\n", lists_modification_present_flag); LOG1("log2_parallel_merge_level : %d\n", log2_parallel_merge_level); LOG1("num_extra_slice_header_bits : %d\n", num_extra_slice_header_bits); LOG1("slice_segment_header_extension_present_flag : %d\n", slice_segment_header_extension_present_flag); LOG1("pps_extension_flag : %d\n", pps_extension_flag); LOG1("pps_range_extension_flag : %d\n", pps_range_extension_flag); LOG1("pps_multilayer_extension_flag : %d\n", pps_multilayer_extension_flag); LOG1("pps_extension_6bits : %d\n", 
pps_extension_6bits); LOG1("Log2MinCuQpDeltaSize : %d\n", Log2MinCuQpDeltaSize); LOG1("Log2MinCuChromaQpOffsetSize (RExt) : %d\n", Log2MinCuChromaQpOffsetSize); LOG1("Log2MaxTransformSkipSize (RExt) : %d\n", Log2MaxTransformSkipSize); #undef LOG0 #undef LOG1 if (pps_range_extension_flag) { range_extension.dump(fd); } } bool pic_parameter_set::is_tile_start_CTB(int ctbX,int ctbY) const { // fast check if (tiles_enabled_flag==0) { return ctbX == 0 && ctbY == 0; } for (int i=0;i * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #ifndef DE265_PPS_H #define DE265_PPS_H #include "libde265/bitstream.h" #include "libde265/sps.h" // for scaling list only #include #include constexpr int DE265_MAX_TILE_COLUMNS = 10; constexpr int DE265_MAX_TILE_ROWS = 10; class decoder_context; class pic_parameter_set; class pps_range_extension { public: pps_range_extension() { reset(); } void reset(); bool read(bitreader*, decoder_context*, const pic_parameter_set*); void dump(int fd) const; uint8_t log2_max_transform_skip_block_size; bool cross_component_prediction_enabled_flag; bool chroma_qp_offset_list_enabled_flag; uint8_t diff_cu_chroma_qp_offset_depth; uint8_t chroma_qp_offset_list_len; int8_t cb_qp_offset_list[6]; int8_t cr_qp_offset_list[6]; uint8_t log2_sao_offset_scale_luma; uint8_t log2_sao_offset_scale_chroma; }; class pic_parameter_set { public: pic_parameter_set(); ~pic_parameter_set(); void reset() { set_defaults(); } bool read(bitreader*, decoder_context*); bool write(error_queue*, CABAC_encoder&, const seq_parameter_set* sps); bool is_tile_start_CTB(int ctbX,int ctbY) const; void dump(int fd) const; void set_defaults(enum PresetSet = Preset_Default); bool pps_read; // whether this pps has been read from bitstream std::shared_ptr sps; uint8_t pic_parameter_set_id; uint8_t seq_parameter_set_id; bool dependent_slice_segments_enabled_flag; bool sign_data_hiding_flag; bool cabac_init_present_flag; uint8_t num_ref_idx_l0_default_active; // [1;16] uint8_t num_ref_idx_l1_default_active; // [1;16] int pic_init_qp; bool constrained_intra_pred_flag; bool transform_skip_enabled_flag; // --- QP --- bool cu_qp_delta_enabled_flag; uint8_t diff_cu_qp_delta_depth; // [ 0 ; log2_diff_max_min_luma_coding_block_size ] int pic_cb_qp_offset; int pic_cr_qp_offset; bool pps_slice_chroma_qp_offsets_present_flag; bool weighted_pred_flag; bool weighted_bipred_flag; bool output_flag_present_flag; bool transquant_bypass_enable_flag; bool entropy_coding_sync_enabled_flag; // --- tiles --- bool tiles_enabled_flag; uint8_t 
num_tile_columns; // [1;PicWidthInCtbsY] max DE265_MAX_TILE_COLUMNS uint8_t num_tile_rows; // [1;PicHeightInCtbsY] max DE265_MAX_TILE_ROWS bool uniform_spacing_flag; // --- --- bool loop_filter_across_tiles_enabled_flag; bool pps_loop_filter_across_slices_enabled_flag; bool deblocking_filter_control_present_flag; bool deblocking_filter_override_enabled_flag; bool pic_disable_deblocking_filter_flag; int8_t beta_offset; // [-12;12] int8_t tc_offset; // [-12;12] bool pic_scaling_list_data_present_flag; struct scaling_list_data scaling_list; // contains valid data if sps->scaling_list_enabled_flag set bool lists_modification_present_flag; uint8_t log2_parallel_merge_level; // [2 ; log2(max CB size)] uint8_t num_extra_slice_header_bits; bool slice_segment_header_extension_present_flag; bool pps_extension_flag; bool pps_range_extension_flag; bool pps_multilayer_extension_flag; uint8_t pps_extension_6bits; pps_range_extension range_extension; // --- derived values --- int Log2MinCuQpDeltaSize; int Log2MinCuChromaQpOffsetSize; int Log2MaxTransformSkipSize; int colWidth [ DE265_MAX_TILE_COLUMNS ]; int rowHeight[ DE265_MAX_TILE_ROWS ]; int colBd [ DE265_MAX_TILE_COLUMNS+1 ]; int rowBd [ DE265_MAX_TILE_ROWS+1 ]; std::vector CtbAddrRStoTS; // #CTBs std::vector CtbAddrTStoRS; // #CTBs std::vector TileId; // #CTBs // index in tile-scan order std::vector TileIdRS; // #CTBs // index in raster-scan order std::vector MinTbAddrZS; // #TBs [x + y*PicWidthInTbsY] void set_derived_values(const seq_parameter_set* sps); }; #endif libde265-1.0.18/libde265/quality.cc000066400000000000000000000050641515675107500164510ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. 
* * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #include "quality.h" #include uint32_t SSD(const uint8_t* img, int imgStride, const uint8_t* ref, int refStride, int width, int height) { uint32_t sum=0; const uint8_t* iPtr = img; const uint8_t* rPtr = ref; for (int y=0;yget_image_plane_at_pos(cIdx,x0,y0), img1->get_image_stride(cIdx), img2->get_image_plane_at_pos(cIdx,x0,y0), img2->get_image_stride(cIdx), 1< * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #ifndef DE265_QUALITY_H #define DE265_QUALITY_H #include #include #include LIBDE265_API uint32_t SSD(const uint8_t* img, int imgStride, const uint8_t* ref, int refStride, int width, int height); LIBDE265_API uint32_t SAD(const uint8_t* img, int imgStride, const uint8_t* ref, int refStride, int width, int height); LIBDE265_API double MSE(const uint8_t* img, int imgStride, const uint8_t* ref, int refStride, int width, int height); LIBDE265_API double PSNR(double mse); LIBDE265_API uint32_t compute_distortion_ssd(const de265_image* img1, const de265_image* img2, int x0, int y0, int log2size, int cIdx); #endif libde265-1.0.18/libde265/refpic.cc000066400000000000000000000334341515675107500162330ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #include "refpic.h" #include "decctx.h" #include "util.h" #include #include #if defined(_MSC_VER) || defined(__MINGW32__) # include #elif defined(HAVE_ALLOCA_H) # include #endif void ref_pic_set::reset() { NumNegativePics = 0; NumPositivePics = 0; NumDeltaPocs = 0; NumPocTotalCurr_shortterm_only = 0; for (int i=0;i& sets, // previously read sets bool sliceRefPicSet) // is this in the slice header? 
{ // --- is this set coded in prediction mode (not possible for the first set) char inter_ref_pic_set_prediction_flag; if (idxRps != 0) { inter_ref_pic_set_prediction_flag = br->get_bits(1); } else { inter_ref_pic_set_prediction_flag = 0; } if (inter_ref_pic_set_prediction_flag) { uint32_t vlc; /* Only for the last ref_pic_set (that's the one coded in the slice header), we can specify relative to which reference set we code the set. */ int delta_idx; if (sliceRefPicSet) { // idxRps == num_short_term_ref_pic_sets) { delta_idx = vlc = br->get_uvlc(); if (vlc==UVLC_ERROR) { return false; } if (delta_idx>=idxRps) { return false; } delta_idx++; } else { delta_idx = 1; } assert(idxRps >= delta_idx); int RIdx = idxRps - delta_idx; // this is our source set, which we will modify (TODO: change type to uint8_t) int delta_rps_sign = br->get_bits(1); vlc = br->get_uvlc(); // abs_delta_rps_minus1 shall be in [0, 2^15-1] (Sec. 7.4.8) if (vlc==UVLC_ERROR || vlc > 32767) { return false; } uint16_t abs_delta_rps = vlc + 1; int DeltaRPS = (delta_rps_sign ? 
-abs_delta_rps : abs_delta_rps); // bits are stored in this order: // - all bits for negative Pocs (forward), // - then all bits for positive Pocs (forward), // - then bits for '0', shifting of the current picture // in total, these are 'nDeltaPocsRIdx'+1 bits logtrace(LogHeaders,"predicted from %d with delta %d\n",RIdx,DeltaRPS); int nDeltaPocsRIdx= sets[RIdx].NumDeltaPocs; // size of source set char *const used_by_curr_pic_flag = (char *)alloca((nDeltaPocsRIdx+1) * sizeof(char)); char *const use_delta_flag = (char *)alloca((nDeltaPocsRIdx+1) * sizeof(char)); for (int j=0;j<=nDeltaPocsRIdx;j++) { used_by_curr_pic_flag[j] = br->get_bits(1); if (used_by_curr_pic_flag[j]) { use_delta_flag[j] = 1; // if this frame is used, we also have to apply the delta } else { use_delta_flag[j] = br->get_bits(1); // otherwise, it is only optionally included } } logtrace(LogHeaders,"flags: "); for (int j=0;j<=nDeltaPocsRIdx;j++) { logtrace(LogHeaders,"%d ", use_delta_flag[j]); } logtrace(LogHeaders,"\n"); int nNegativeRIdx = sets[RIdx].NumNegativePics; int nPositiveRIdx = sets[RIdx].NumPositivePics; // --- update list 0 (negative Poc) --- // Iterate through all Pocs in decreasing value order (positive reverse, 0, negative forward). 
int i=0; // target index // positive list for (int j=nPositiveRIdx-1;j>=0;j--) { assert(RIdx >= 0 && static_cast(RIdx) < sets.size()); assert(j>=0 && j < MAX_NUM_REF_PICS); int dPoc = sets[RIdx].DeltaPocS1[j] + DeltaRPS; // new delta if (dPoc<0 && use_delta_flag[nNegativeRIdx+j]) { if (i>= MAX_NUM_REF_PICS) { return false; } out_set->DeltaPocS0[i] = dPoc; out_set->UsedByCurrPicS0[i] = used_by_curr_pic_flag[nNegativeRIdx+j]; i++; } } // frame 0 if (DeltaRPS<0 && use_delta_flag[nDeltaPocsRIdx]) { if (i>= MAX_NUM_REF_PICS) { return false; } out_set->DeltaPocS0[i] = DeltaRPS; out_set->UsedByCurrPicS0[i] = used_by_curr_pic_flag[nDeltaPocsRIdx]; i++; } // negative list for (int j=0;j= MAX_NUM_REF_PICS) { return false; } out_set->DeltaPocS0[i] = dPoc; out_set->UsedByCurrPicS0[i] = used_by_curr_pic_flag[j]; i++; } } out_set->NumNegativePics = i; // --- update list 1 (positive Poc) --- // Iterate through all Pocs in increasing value order (negative reverse, 0, positive forward) i=0; // target index // negative list for (int j=nNegativeRIdx-1;j>=0;j--) { int dPoc = sets[RIdx].DeltaPocS0[j] + DeltaRPS; if (dPoc>0 && use_delta_flag[j]) { if (i>= MAX_NUM_REF_PICS) { return false; } out_set->DeltaPocS1[i] = dPoc; out_set->UsedByCurrPicS1[i] = used_by_curr_pic_flag[j]; i++; } } // frame 0 if (DeltaRPS>0 && use_delta_flag[nDeltaPocsRIdx]) { if (i>= MAX_NUM_REF_PICS) { return false; } out_set->DeltaPocS1[i] = DeltaRPS; out_set->UsedByCurrPicS1[i] = used_by_curr_pic_flag[nDeltaPocsRIdx]; i++; } // positive list for (int j=0;j0 && use_delta_flag[nNegativeRIdx+j]) { if (i>= MAX_NUM_REF_PICS) { return false; } out_set->DeltaPocS1[i] = dPoc; out_set->UsedByCurrPicS1[i] = used_by_curr_pic_flag[nNegativeRIdx+j]; i++; } } out_set->NumPositivePics = i; } else { // --- first, read the number of past and future frames in this set --- uint32_t num_negative_pics = br->get_uvlc(); uint32_t num_positive_pics = br->get_uvlc(); if (num_negative_pics == UVLC_ERROR || num_positive_pics == UVLC_ERROR 
|| num_negative_pics > MAX_NUM_REF_PICS || num_positive_pics > MAX_NUM_REF_PICS) { errqueue->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false); return false; } // total number of reference pictures may not exceed buffer capacity if (num_negative_pics + num_positive_pics > static_cast(sps->sps_max_dec_pic_buffering[ sps->sps_max_sub_layers-1 ])) { out_set->NumNegativePics = 0; out_set->NumPositivePics = 0; out_set->NumDeltaPocs = 0; out_set->NumPocTotalCurr_shortterm_only = 0; errqueue->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false); return false; } out_set->NumNegativePics = num_negative_pics; out_set->NumPositivePics = num_positive_pics; // --- now, read the deltas between the reference frames to fill the lists --- // past frames int16_t lastPocS=0; for (uint32_t i=0;iget_uvlc(); if (delta_poc_s0==UVLC_ERROR) { return false; } delta_poc_s0++; char used_by_curr_pic_s0_flag = br->get_bits(1); if (delta_poc_s0 > static_cast(lastPocS - INT16_MIN)) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return false; } // cast to int because delta_poc_s0 can be up to 32768 when lastPocS==0 int16_t pocS = lastPocS - static_cast(delta_poc_s0); out_set->DeltaPocS0[i] = pocS; out_set->UsedByCurrPicS0[i] = used_by_curr_pic_s0_flag; lastPocS = pocS; } // future frames lastPocS = 0; for (uint32_t i=0;iget_uvlc(); if (delta_poc_s1==UVLC_ERROR) { return false; } delta_poc_s1++; char used_by_curr_pic_s1_flag = br->get_bits(1); if (delta_poc_s1 > static_cast(INT16_MAX - lastPocS)) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return false; } int16_t pocS = lastPocS + static_cast(delta_poc_s1); out_set->DeltaPocS1[i] = pocS; out_set->UsedByCurrPicS1[i] = used_by_curr_pic_s1_flag; lastPocS = pocS; } } out_set->compute_derived_values(); return true; } bool write_short_term_ref_pic_set_nopred(error_queue* errqueue, const seq_parameter_set* sps, CABAC_encoder& out, const ref_pic_set* in_set, // which set to write int 
idxRps, // index of the set to be written const std::vector& sets, // previously read sets bool sliceRefPicSet) // is this in the slice header? { if (idxRps != 0) { // inter_ref_pic_set_prediction_flag out.write_bit(0); } // --- first, write the number of past and future frames in this set --- out.write_uvlc(in_set->NumNegativePics); out.write_uvlc(in_set->NumPositivePics); // --- now, write the deltas between the reference frames to fill the lists --- // past frames int lastPocS=0; for (int i=0;iNumNegativePics;i++) { int delta_poc_s0 = lastPocS - in_set->DeltaPocS0[i]; char used_by_curr_pic_s0_flag = in_set->UsedByCurrPicS0[i]; assert(delta_poc_s0 >= 1); out.write_uvlc(delta_poc_s0-1); out.write_bit(used_by_curr_pic_s0_flag); lastPocS = in_set->DeltaPocS0[i]; } // future frames lastPocS=0; for (int i=0;iNumPositivePics;i++) { int delta_poc_s1 = in_set->DeltaPocS1[i] - lastPocS; char used_by_curr_pic_s1_flag = in_set->UsedByCurrPicS1[i]; assert(delta_poc_s1 >= 1); out.write_uvlc(delta_poc_s1-1); out.write_bit(used_by_curr_pic_s1_flag); lastPocS = in_set->DeltaPocS1[i]; } return true; } bool write_short_term_ref_pic_set(error_queue* errqueue, const seq_parameter_set* sps, CABAC_encoder& out, const ref_pic_set* in_set, // which set to write int idxRps, // index of the set to be read const std::vector& sets, // previously read sets bool sliceRefPicSet) // is this in the slice header? 
{ return write_short_term_ref_pic_set_nopred(errqueue, sps, out, in_set, idxRps, sets, sliceRefPicSet); } void dump_short_term_ref_pic_set(const ref_pic_set* set, FILE* fh) { log2fh(fh,"NumDeltaPocs: %d [-:%d +:%d]\n", set->NumDeltaPocs, set->NumNegativePics, set->NumPositivePics); log2fh(fh,"DeltaPocS0:"); for (int i=0;iNumNegativePics;i++) { if (i) { log2fh(fh,","); } log2fh(fh," %d/%d",set->DeltaPocS0[i],set->UsedByCurrPicS0[i]); } log2fh(fh,"\n"); log2fh(fh,"DeltaPocS1:"); for (int i=0;iNumPositivePics;i++) { if (i) { log2fh(fh,","); } log2fh(fh," %d/%d",set->DeltaPocS1[i],set->UsedByCurrPicS1[i]); } log2fh(fh,"\n"); } void dump_compact_short_term_ref_pic_set(const ref_pic_set* set, int range, FILE* fh) { char *const log = (char *)alloca((range+1+range+1) * sizeof(char)); log[2*range+1] = 0; for (int i=0;i<2*range+1;i++) log[i]='.'; log[range]='|'; for (int i=set->NumNegativePics-1;i>=0;i--) { int n = set->DeltaPocS0[i]; if (n>=-range && n<=range) { if (set->UsedByCurrPicS0[i]) log[n+range] = 'X'; else log[n+range] = 'o'; } else { log2fh(fh,"*%d%c ",n, set->UsedByCurrPicS0[i] ? 'X':'o'); } } for (int i=set->NumPositivePics-1;i>=0;i--) { int n = set->DeltaPocS1[i]; if (n>=-range && n<=range) { if (set->UsedByCurrPicS1[i]) log[n+range] = 'X'; else log[n+range] = 'o'; } else { log2fh(fh,"*%d%c ",n, set->UsedByCurrPicS1[i] ? 'X':'o'); } } log2fh(fh,"*%s\n",log); } libde265-1.0.18/libde265/refpic.h000066400000000000000000000043261515675107500160730ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. 
* * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef DE265_REFPIC_H #define DE265_REFPIC_H #include "libde265/bitstream.h" #include constexpr int MAX_NUM_REF_PICS = 16; // maximum defined by standard, may be lower for some Levels class ref_pic_set { public: // Lists of pictures that have to be kept in the decoded picture buffer for future // reference and that may optionally be used for prediction in the current frame. // Lists contain the relative POC positions. int16_t DeltaPocS0[MAX_NUM_REF_PICS]; // sorted in decreasing order (e.g. -1, -2, -4, -7, ...) int16_t DeltaPocS1[MAX_NUM_REF_PICS]; // sorted in ascending order (e.g. 1, 2, 4, 7) // flag for each reference whether this is actually used for prediction in the current frame uint8_t UsedByCurrPicS0[MAX_NUM_REF_PICS]; uint8_t UsedByCurrPicS1[MAX_NUM_REF_PICS]; uint8_t NumNegativePics; // number of past reference pictures uint8_t NumPositivePics; // number of future reference pictures // --- derived values --- void compute_derived_values(); uint8_t NumDeltaPocs; // total number of reference pictures (past + future) uint8_t NumPocTotalCurr_shortterm_only; /* Total number of reference pictures that may actually be used for prediction in the current frame. */ void reset(); }; void dump_short_term_ref_pic_set(const ref_pic_set*, FILE* fh); void dump_compact_short_term_ref_pic_set(const ref_pic_set* set, int range, FILE* fh); #endif libde265-1.0.18/libde265/sao.cc000066400000000000000000000421651515675107500155460ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. 
* * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #include "sao.h" #include "util.h" #include #include template void apply_sao_internal(de265_image* img, int xCtb,int yCtb, const slice_segment_header* shdr, int cIdx, int nSW,int nSH, const pixel_t* in_img, int in_stride, /* */ pixel_t* out_img, int out_stride) { const sao_info* saoinfo = img->get_sao_info(xCtb,yCtb); int SaoTypeIdx = (saoinfo->SaoTypeIdx >> (2*cIdx)) & 0x3; logtrace(LogSAO,"apply_sao CTB %d;%d cIdx:%d type=%d (%dx%d)\n",xCtb,yCtb,cIdx, SaoTypeIdx, nSW,nSH); if (SaoTypeIdx==0) { return; } const seq_parameter_set* sps = &img->get_sps(); const pic_parameter_set* pps = &img->get_pps(); const int bitDepth = (cIdx==0 ? sps->BitDepth_Y : sps->BitDepth_C); const int maxPixelValue = (1<get_width(cIdx); const int height = img->get_height(cIdx); const int ctbSliceAddrRS = img->get_SliceHeader(xC,yC)->SliceAddrRS; const int picWidthInCtbs = sps->PicWidthInCtbsY; const int chromashiftW = sps->get_chroma_shift_W(cIdx); const int chromashiftH = sps->get_chroma_shift_H(cIdx); const int ctbshiftW = sps->Log2CtbSizeY - chromashiftW; const int ctbshiftH = sps->Log2CtbSizeY - chromashiftH; for (int i=0;i<5;i++) { logtrace(LogSAO,"offset[%d] = %d\n", i, i==0 ? 0 : saoinfo->saoOffsetVal[cIdx][i-1]); } // actual size of CTB to be processed (can be smaller when partially outside of image) const int ctbW = (xC+nSW>width) ? 
width -xC : nSW; const int ctbH = (yC+nSH>height) ? height-yC : nSH; const bool extendedTests = img->get_CTB_has_pcm_or_cu_transquant_bypass(xCtb,yCtb); if (SaoTypeIdx==2) { int hPos[2], vPos[2]; int vPosStride[2]; // vPos[] multiplied by image stride int SaoEoClass = (saoinfo->SaoEoClass >> (2*cIdx)) & 0x3; switch (SaoEoClass) { case 0: hPos[0]=-1; hPos[1]= 1; vPos[0]= 0; vPos[1]=0; break; case 1: hPos[0]= 0; hPos[1]= 0; vPos[0]=-1; vPos[1]=1; break; case 2: hPos[0]=-1; hPos[1]= 1; vPos[0]=-1; vPos[1]=1; break; case 3: hPos[0]= 1; hPos[1]=-1; vPos[0]=-1; vPos[1]=1; break; } vPosStride[0] = vPos[0] * in_stride; vPosStride[1] = vPos[1] * in_stride; /* Reorder sao_info.saoOffsetVal[] array, so that we can index it directly with the sum of the two pixel-difference signs. */ int8_t saoOffsetVal[5]; // [2] unused saoOffsetVal[0] = saoinfo->saoOffsetVal[cIdx][1-1]; saoOffsetVal[1] = saoinfo->saoOffsetVal[cIdx][2-1]; saoOffsetVal[2] = 0; saoOffsetVal[3] = saoinfo->saoOffsetVal[cIdx][3-1]; saoOffsetVal[4] = saoinfo->saoOffsetVal[cIdx][4-1]; for (int j=0;jpcm_loop_filter_disable_flag && img->get_pcm_flag((xC+i)<get_cu_transquant_bypass((xC+i)<=width || yS>=height) { edgeIdx=0; break; } // This part seems inefficient with all the get_SliceHeaderIndex() calls, // but removing this part (because the input was known to have only a single // slice anyway) reduced computation time only by 1.3%. // TODO: however, this may still be a big part of SAO itself. 
slice_segment_header* sliceHeader = img->get_SliceHeader(xS<SliceAddrRS; if (sliceAddrRS < ctbSliceAddrRS && img->get_SliceHeader((xC+i)<slice_loop_filter_across_slices_enabled_flag==0) { edgeIdx=0; break; } if (sliceAddrRS > ctbSliceAddrRS && img->get_SliceHeader(xS<slice_loop_filter_across_slices_enabled_flag==0) { edgeIdx=0; break; } if (pps->loop_filter_across_tiles_enabled_flag==0 && pps->TileIdRS[(xS>>ctbshiftW) + (yS>>ctbshiftH)*picWidthInCtbs] != pps->TileIdRS[(xC>>ctbshiftW) + (yC>>ctbshiftH)*picWidthInCtbs]) { edgeIdx=0; break; } } if (edgeIdx != 0) { edgeIdx = ( Sign(in_ptr[i] - in_ptr[i+hPos[0]+vPosStride[0]]) + Sign(in_ptr[i] - in_ptr[i+hPos[1]+vPosStride[1]]) ); if (1) { // edgeIdx != 0) { // seems to be faster without this check (zero in offset table) int offset = saoOffsetVal[edgeIdx+2]; out_ptr[i] = Clip3(0,maxPixelValue, in_ptr[i] + offset); } } } } } else { int bandShift = bitDepth-5; int saoLeftClass = saoinfo->sao_band_position[cIdx]; logtrace(LogSAO,"saoLeftClass: %d\n",saoLeftClass); int bandTable[32]; memset(bandTable, 0, sizeof(int)*32); for (int k=0;k<4;k++) { bandTable[ (k+saoLeftClass)&31 ] = k+1; } /* If PCM or transquant_bypass is used in this CTB, we have to run all checks (A). Otherwise, we run a simplified version of the code (B). NOTE: this whole part of SAO does not seem to be a significant part of the time spent */ if (extendedTests) { // (A) full version with all checks for (int j=0;jpcm_loop_filter_disable_flag && img->get_pcm_flag((xC+i)<get_cu_transquant_bypass((xC+i)<>x actually computes >>(x%64). // But this should never happen, because the maximum bit-depth is 16. int pixel = in_img[xC + i + (yC + j) * in_stride]; // Note: the input pixel value should never exceed the valid range, but it seems that it still does, // maybe when there was a decoding error and the pixels have not been filled in correctly. // Thus, we have to limit the pixel range to ensure that we have no illegal table access. 
pixel = Clip3(0, maxPixelValue, pixel); int bandIdx = bandTable[pixel >> bandShift]; if (bandIdx>0) { int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1]; logtrace(LogSAO,"%d %d (%d) offset %d %x -> %x\n",xC+i,yC+j,bandIdx, offset, in_img[xC+i+(yC+j)*in_stride], in_img[xC+i+(yC+j)*in_stride]+offset); out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue, in_img[xC+i+(yC+j)*in_stride] + offset); } } } else { // (B) simplified version (only works if no PCM and transquant_bypass is active) for (int j=0;j> bandShift]; if (bandIdx>0) { int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1]; out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue, in_img[xC+i+(yC+j)*in_stride] + offset); } } } } } template void apply_sao(de265_image* img, int xCtb,int yCtb, const slice_segment_header* shdr, int cIdx, int nSW,int nSH, const pixel_t* in_img, int in_stride, /* */ pixel_t* out_img, int out_stride) { if (img->high_bit_depth(cIdx)) { apply_sao_internal(img,xCtb,yCtb, shdr,cIdx,nSW,nSH, reinterpret_cast(in_img), in_stride, reinterpret_cast(out_img),out_stride); } else { apply_sao_internal(img,xCtb,yCtb, shdr,cIdx,nSW,nSH, in_img, in_stride, out_img,out_stride); } } void apply_sample_adaptive_offset(de265_image* img) { const seq_parameter_set& sps = img->get_sps(); if (sps.sample_adaptive_offset_enabled_flag==0) { return; } de265_image inputCopy; de265_error err = inputCopy.copy_image(img); if (err != DE265_OK) { img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false); return; } for (int yCtb=0; yCtbget_SliceHeaderCtb(xCtb,yCtb); if (shdr->slice_sao_luma_flag) { apply_sao(img, xCtb,yCtb, shdr, 0, 1<get_image_plane(0), img->get_image_stride(0)); } if (shdr->slice_sao_chroma_flag) { int nSW = (1<get_image_plane(1), img->get_image_stride(1)); apply_sao(img, xCtb,yCtb, shdr, 2, nSW,nSH, inputCopy.get_image_plane(2), inputCopy.get_image_stride(2), img->get_image_plane(2), img->get_image_stride(2)); } } } void apply_sample_adaptive_offset_sequential(de265_image* 
img) { const seq_parameter_set& sps = img->get_sps(); if (sps.sample_adaptive_offset_enabled_flag==0) { return; } int lumaImageSize = img->get_image_stride(0) * img->get_height(0) * img->get_bytes_per_pixel(0); int chromaImageSize = img->get_image_stride(1) * img->get_height(1) * img->get_bytes_per_pixel(1); uint8_t* inputCopy = new uint8_t[ libde265_max(lumaImageSize, chromaImageSize) ]; if (inputCopy == nullptr) { img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false); return; } int nChannels = 3; if (sps.ChromaArrayType == CHROMA_MONO) { nChannels=1; } for (int cIdx=0;cIdxget_image_stride(cIdx); int height = img->get_height(cIdx); memcpy(inputCopy, img->get_image_plane(cIdx), stride * height * img->get_bytes_per_pixel(cIdx)); for (int yCtb=0; yCtbget_SliceHeaderCtb(xCtb,yCtb); if (shdr==nullptr) { delete[] inputCopy; return; } if (cIdx==0 && shdr->slice_sao_luma_flag) { apply_sao(img, xCtb,yCtb, shdr, 0, 1<get_image_plane(0), img->get_image_stride(0)); } if (cIdx!=0 && shdr->slice_sao_chroma_flag) { int nSW = (1<get_image_plane(cIdx), img->get_image_stride(cIdx)); } } } delete[] inputCopy; } class thread_task_sao : public thread_task { public: int ctb_y; de265_image* img; /* this is where we get the SPS from (either inputImg or outputImg can be a dummy image) */ de265_image* inputImg; de265_image* outputImg; int inputProgress; virtual void work(); virtual std::string name() const { char buf[100]; sprintf(buf,"sao-%d",ctb_y); return buf; } }; void thread_task_sao::work() { state = Running; img->thread_run(this); const seq_parameter_set& sps = img->get_sps(); const int rightCtb = sps.PicWidthInCtbsY-1; const int ctbSize = (1<wait_for_progress(this, rightCtb,ctb_y, inputProgress); if (ctb_y>0) { img->wait_for_progress(this, rightCtb,ctb_y-1, inputProgress); } if (ctb_y+1wait_for_progress(this, rightCtb,ctb_y+1, inputProgress); } // copy input image to output for this CTB-row outputImg->copy_lines_from(inputImg, ctb_y * ctbSize, (ctb_y+1) * 
ctbSize); // process SAO in the CTB-row for (int xCtb=0; xCtbget_SliceHeaderCtb(xCtb,ctb_y); if (shdr==nullptr) { break; } if (shdr->slice_sao_luma_flag) { apply_sao(img, xCtb,ctb_y, shdr, 0, ctbSize, ctbSize, inputImg ->get_image_plane(0), inputImg ->get_image_stride(0), outputImg->get_image_plane(0), outputImg->get_image_stride(0)); } if (shdr->slice_sao_chroma_flag) { int nSW = ctbSize / sps.SubWidthC; int nSH = ctbSize / sps.SubHeightC; apply_sao(img, xCtb,ctb_y, shdr, 1, nSW,nSH, inputImg ->get_image_plane(1), inputImg ->get_image_stride(1), outputImg->get_image_plane(1), outputImg->get_image_stride(1)); apply_sao(img, xCtb,ctb_y, shdr, 2, nSW,nSH, inputImg ->get_image_plane(2), inputImg ->get_image_stride(2), outputImg->get_image_plane(2), outputImg->get_image_stride(2)); } } // mark SAO progress for (int x=0;x<=rightCtb;x++) { const int CtbWidth = sps.PicWidthInCtbsY; img->ctb_progress[x+ctb_y*CtbWidth].set_progress(CTB_PROGRESS_SAO); } state = Finished; img->thread_finishes(this); } bool add_sao_tasks(image_unit* imgunit, int saoInputProgress) { de265_image* img = imgunit->img; const seq_parameter_set& sps = img->get_sps(); if (sps.sample_adaptive_offset_enabled_flag==0) { return false; } decoder_context* ctx = img->decctx; de265_error err = imgunit->sao_output.alloc_image(img->get_width(), img->get_height(), img->get_chroma_format(), img->get_shared_sps(), false, img->decctx, //img->encctx, img->pts, img->user_data, true); if (err != DE265_OK) { img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false); return false; } int nRows = sps.PicHeightInCtbsY; img->thread_start(nRows); for (int y=0;yinputImg = img; task->outputImg = &imgunit->sao_output; task->img = img; task->ctb_y = y; task->inputProgress = saoInputProgress; imgunit->tasks.push_back(task); ctx->thread_pool_.add_task(task); } /* Currently need barrier here because when are finished, we have to swap the pixel data back into the main image. 
*/ img->wait_for_completion(); img->exchange_pixel_data_with(imgunit->sao_output); return true; } libde265-1.0.18/libde265/sao.h000066400000000000000000000023431515675107500154020ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef DE265_SAO_H #define DE265_SAO_H #include "libde265/decctx.h" void apply_sample_adaptive_offset(de265_image* img); /* requires less memory than the function above */ void apply_sample_adaptive_offset_sequential(de265_image* img); /* saoInputProgress - the CTB progress that SAO will wait for before beginning processing. Returns 'true' if any tasks have been added. */ bool add_sao_tasks(image_unit* imgunit, int saoInputProgress); #endif libde265-1.0.18/libde265/scan.cc000066400000000000000000000103561515675107500157050ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. 
* * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #include "scan.h" static position scan0 = { 0,0 }; static position scan_h_1[ 2* 2], scan_v_1[ 2* 2], scan_d_1[ 2* 2]; static position scan_h_2[ 4* 4], scan_v_2[ 4* 4], scan_d_2[ 4* 4]; static position scan_h_3[ 8* 8], scan_v_3[ 8* 8], scan_d_3[ 8* 8]; static position scan_h_4[16*16], scan_v_4[16*16], scan_d_4[16*16]; static position scan_h_5[32*32], scan_v_5[32*32], scan_d_5[32*32]; static position* scan_h[7] = { &scan0,scan_h_1,scan_h_2,scan_h_3,scan_h_4,scan_h_5 }; static position* scan_v[7] = { &scan0,scan_v_1,scan_v_2,scan_v_3,scan_v_4,scan_v_5 }; static position* scan_d[7] = { &scan0,scan_d_1,scan_d_2,scan_d_3,scan_d_4,scan_d_5 }; static void init_scan_h(position* scan, int blkSize) { int i=0; for (int y=0;y=0) { if (xsubBlock = lastSubBlock; pos->scanPos = lastScanPos; } void init_scan_orders() { for (int log2size=1;log2size<=5;log2size++) { init_scan_h(scan_h[log2size], 1< * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #ifndef DE265_SCAN_H #define DE265_SCAN_H #include typedef struct { uint8_t x,y; } position; typedef struct { uint8_t subBlock; uint8_t scanPos; } scan_position; void init_scan_orders(); /* scanIdx: 0 - diag, 1 - horiz, 2 - verti */ const position* get_scan_order(int log2BlockSize, int scanIdx); scan_position get_scan_position(int x,int y, int scanIdx, int log2BlkSize); #endif libde265-1.0.18/libde265/sei.cc000066400000000000000000000316721515675107500155450ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #include "sei.h" #include "util.h" #include "md5.h" #include "libde265/sps.h" #include "libde265/image.h" #include "libde265/decctx.h" #include static de265_error read_sei_decoded_picture_hash(bitreader* reader, sei_message* sei, const seq_parameter_set* sps) { sei_decoded_picture_hash* seihash = &sei->data.decoded_picture_hash; seihash->hash_type = (enum sei_decoded_picture_hash_type)reader->get_bits(8); if (sps==nullptr) { return DE265_WARNING_SPS_MISSING_CANNOT_DECODE_SEI; } int nHashes = sps->chroma_format_idc==0 ? 
1 : 3; for (int i=0;ihash_type) { case sei_decoded_picture_hash_type_MD5: for (int b=0;b<16;b++) { seihash->md5[i][b] = reader->get_bits(8); } break; case sei_decoded_picture_hash_type_CRC: seihash->crc[i] = reader->get_bits(16); break; case sei_decoded_picture_hash_type_checksum: seihash->checksum[i] = reader->get_bits(32); break; } } return DE265_OK; } static void dump_sei_decoded_picture_hash(const sei_message* sei, const seq_parameter_set* sps) { const sei_decoded_picture_hash* seihash = &sei->data.decoded_picture_hash; loginfo(LogSEI," hash_type: "); switch (seihash->hash_type) { case sei_decoded_picture_hash_type_MD5: loginfo(LogSEI,"MD5\n"); break; case sei_decoded_picture_hash_type_CRC: loginfo(LogSEI,"CRC\n"); break; case sei_decoded_picture_hash_type_checksum: loginfo(LogSEI,"checksum\n"); break; } int nHashes = sps->chroma_format_idc==0 ? 1 : 3; for (int i=0;ihash_type) { case sei_decoded_picture_hash_type_MD5: loginfo(LogSEI," MD5[%d]: %02x", i,seihash->md5[i][0]); for (int b=1;b<16;b++) { loginfo(LogSEI,"*:%02x", seihash->md5[i][b]); } loginfo(LogSEI,"*\n"); break; case sei_decoded_picture_hash_type_CRC: loginfo(LogSEI," CRC[%d]: %02x\n", i,seihash->crc[i]); break; case sei_decoded_picture_hash_type_checksum: loginfo(LogSEI," checksum[%d]: %04x\n", i,seihash->checksum[i]); break; } } } class raw_hash_data { public: raw_hash_data(int w, int stride); ~raw_hash_data(); struct data_chunk { const uint8_t* data; int len; }; data_chunk prepare_8bit(const uint8_t* data,int y); data_chunk prepare_16bit(const uint8_t* data,int y); private: int mWidth, mStride; uint8_t* mMem; }; raw_hash_data::raw_hash_data(int w, int stride) { mWidth=w; mStride=stride; mMem = nullptr; } raw_hash_data::~raw_hash_data() { delete[] mMem; } raw_hash_data::data_chunk raw_hash_data::prepare_8bit(const uint8_t* data,int y) { data_chunk chunk; chunk.data = data+y*mStride; chunk.len = mWidth; return chunk; } raw_hash_data::data_chunk raw_hash_data::prepare_16bit(const uint8_t* data,int 
y) { if (mMem == nullptr) { mMem = new uint8_t[2*mWidth]; } const uint16_t* data16 = (uint16_t*)data; for (int x=0; x> 8; } data_chunk chunk; chunk.data = mMem; chunk.len = 2*mWidth; return chunk; } static uint32_t compute_checksum(uint8_t* data,int w,int h,int stride, int bit_depth) { uint32_t sum = 0; if (bit_depth<=8) { for (int y=0; y> 8 ) ^ ( y >> 8 ); sum += data[y*stride + x] ^ xorMask; } } else { auto* data16 = reinterpret_cast(data); int stride16 = stride / 2; for (int y=0; y> 8 ) ^ ( y >> 8 ); sum += (data16[y*stride16 + x] & 0xFF) ^ xorMask; sum += (data16[y*stride16 + x] >> 8) ^ xorMask; } } return sum & 0xFFFFFFFF; } /* static inline uint16_t crc_process_byte(uint16_t crc, uint8_t byte) { for (int bit=0;bit<8;bit++) { int bitVal = (byte >> (7-bit)) & 1; int crcMsb = (crc>>15) & 1; crc = (((crc<<1) + bitVal) & 0xFFFF); if (crcMsb) { crc ^= 0x1021; } } return crc; } static uint16_t compute_CRC_8bit_old(const uint8_t* data,int w,int h,int stride) { uint16_t crc = 0xFFFF; for (int y=0; y> 8); uint16_t t = s ^ (s >> 4); return ((crc << 8) ^ t ^ (t << 5) ^ (t << 12)) & 0xFFFF; } static uint32_t compute_CRC_8bit_fast(const uint8_t* data,int w,int h,int stride, int bit_depth) { raw_hash_data raw_data(w,stride); uint16_t crc = 0xFFFF; crc = crc_process_byte_parallel(crc, 0); crc = crc_process_byte_parallel(crc, 0); for (int y=0; y8) chunk = raw_data.prepare_16bit(data, y); else chunk = raw_data.prepare_8bit(data, y); for(int x=0; x8) chunk = raw_data.prepare_16bit(data, y); else chunk = raw_data.prepare_8bit(data, y); MD5_Update(&md5, (void*)chunk.data, chunk.len); } MD5_Final(result, &md5); } static de265_error process_sei_decoded_picture_hash(const sei_message* sei, de265_image* img) { const sei_decoded_picture_hash* seihash = &sei->data.decoded_picture_hash; /* Do not check SEI on pictures that are not output. Hash may be wrong, because of a broken link (BLA). 
This happens, for example in conformance stream RAP_B, where a EOS-NAL appears before a CRA (POC=32). */ if (img->PicOutputFlag == false) { return DE265_OK; } //write_picture(img); int nHashes = img->get_sps().chroma_format_idc==0 ? 1 : 3; for (int i=0;iget_width(i); h = img->get_height(i); data = img->get_image_plane(i); stride = img->get_image_stride(i); switch (seihash->hash_type) { case sei_decoded_picture_hash_type_MD5: { uint8_t md5[16]; compute_MD5(data,w,h,stride,md5, img->get_bit_depth(i)); /* fprintf(stderr,"computed MD5: "); for (int b=0;b<16;b++) { fprintf(stderr,"%02x", md5[b]); } fprintf(stderr,"\n"); */ for (int b=0;b<16;b++) { if (md5[b] != seihash->md5[i][b]) { /* fprintf(stderr,"SEI decoded picture MD5 mismatch (POC=%d)\n", img->PicOrderCntVal); */ return DE265_ERROR_CHECKSUM_MISMATCH; } } } break; case sei_decoded_picture_hash_type_CRC: { uint16_t crc = compute_CRC_8bit_fast(data,w,h,stride, img->get_bit_depth(i)); logtrace(LogSEI,"SEI decoded picture hash: %04x <-[%d]-> decoded picture: %04x\n", seihash->crc[i], i, crc); if (crc != seihash->crc[i]) { /* fprintf(stderr,"SEI decoded picture hash: %04x, decoded picture: %04x (POC=%d)\n", seihash->crc[i], crc, img->PicOrderCntVal); */ return DE265_ERROR_CHECKSUM_MISMATCH; } } break; case sei_decoded_picture_hash_type_checksum: { uint32_t chksum = compute_checksum(data,w,h,stride, img->get_bit_depth(i)); if (chksum != seihash->checksum[i]) { /* fprintf(stderr,"SEI decoded picture hash: %04x, decoded picture: %04x (POC=%d)\n", seihash->checksum[i], chksum, img->PicOrderCntVal); */ return DE265_ERROR_CHECKSUM_MISMATCH; } } break; } } loginfo(LogSEI,"decoded picture hash checked: OK\n"); //printf("checked picture %d SEI: OK\n", img->PicOrderCntVal); return DE265_OK; } #define MAX_SEI_SIZE UINT32_C(0xFFFFFFFF) de265_error read_sei(bitreader* reader, sei_message* sei, bool suffix, const seq_parameter_set* sps) { uint16_t payload_type = 0; for (;;) { uint8_t byte = static_cast(reader->get_bits(8)); if 
(std::numeric_limits::max() - byte < payload_type) { return DE265_ERROR_CANNOT_PROCESS_SEI; } payload_type += byte; if (byte != 0xFF) { break; } } //printf("SEI payload: %d\n",payload_type); uint32_t payload_size = 0; for (;;) { uint32_t byte = reader->get_bits(8); if (MAX_SEI_SIZE - byte < payload_type) { return DE265_ERROR_CANNOT_PROCESS_SEI; } payload_size += byte; if (byte != 0xFF) { break; } } sei->payload_type = payload_type; sei->payload_size = payload_size; // --- sei message dispatch de265_error err = DE265_OK; switch (sei->payload_type) { case sei_payload_type_decoded_picture_hash: err = read_sei_decoded_picture_hash(reader,sei,sps); break; default: // TODO: unknown SEI messages are ignored break; } return err; } void dump_sei(const sei_message* sei, const seq_parameter_set* sps) { loginfo(LogHeaders,"SEI message: %s\n", sei_type_name(sei->payload_type)); switch (sei->payload_type) { case sei_payload_type_decoded_picture_hash: dump_sei_decoded_picture_hash(sei, sps); break; default: // TODO: unknown SEI messages are ignored break; } } de265_error process_sei(const sei_message* sei, de265_image* img) { de265_error err = DE265_OK; switch (sei->payload_type) { case sei_payload_type_decoded_picture_hash: if (img->decctx->param_sei_check_hash) { err = process_sei_decoded_picture_hash(sei, img); if (err==DE265_OK) { //printf("SEI check ok\n"); } } break; default: // TODO: unknown SEI messages are ignored break; } return err; } const char* sei_type_name(uint16_t type) { switch (type) { case sei_payload_type_buffering_period: return "buffering_period"; case sei_payload_type_pic_timing: return "pic_timing"; case sei_payload_type_pan_scan_rect: return "pan_scan_rect"; case sei_payload_type_filler_payload: return "filler_payload"; case sei_payload_type_user_data_registered_itu_t_t35: return "user_data_registered_itu_t_t35"; case sei_payload_type_user_data_unregistered: return "user_data_unregistered"; case sei_payload_type_recovery_point: return "recovery_point"; 
case sei_payload_type_scene_info: return "scene_info"; case sei_payload_type_picture_snapshot: return "picture_snapshot"; case sei_payload_type_progressive_refinement_segment_start: return "progressive_refinement_segment_start"; case sei_payload_type_progressive_refinement_segment_end: return "progressive_refinement_segment_end"; case sei_payload_type_film_grain_characteristics: return "film_grain_characteristics"; case sei_payload_type_post_filter_hint: return "post_filter_hint"; case sei_payload_type_tone_mapping_info: return "tone_mapping_info"; case sei_payload_type_frame_packing_arrangement: return "frame_packing_arrangement"; case sei_payload_type_display_orientation: return "display_orientation"; case sei_payload_type_structure_of_pictures_info: return "structure_of_pictures_info"; case sei_payload_type_active_parameter_sets: return "active_parameter_sets"; case sei_payload_type_decoding_unit_info: return "decoding_unit_info"; case sei_payload_type_temporal_sub_layer_zero_index: return "temporal_sub_layer_zero_index"; case sei_payload_type_decoded_picture_hash: return "decoded_picture_hash"; case sei_payload_type_scalable_nesting: return "scalable_nesting"; case sei_payload_type_region_refresh_info: return "region_refresh_info"; case sei_payload_type_no_display: return "no_display"; case sei_payload_type_motion_constrained_tile_sets: return "motion_constrained_tile_sets"; default: return "unknown SEI message"; } } libde265-1.0.18/libde265/sei.h000066400000000000000000000055261515675107500154060ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. 
* * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef DE265_SEI_H #define DE265_SEI_H #include "libde265/bitstream.h" #include "libde265/de265.h" enum sei_payload_type { sei_payload_type_buffering_period = 0, sei_payload_type_pic_timing = 1, sei_payload_type_pan_scan_rect = 2, sei_payload_type_filler_payload = 3, sei_payload_type_user_data_registered_itu_t_t35 = 4, sei_payload_type_user_data_unregistered = 5, sei_payload_type_recovery_point = 6, sei_payload_type_scene_info = 9, sei_payload_type_picture_snapshot = 15, sei_payload_type_progressive_refinement_segment_start = 16, sei_payload_type_progressive_refinement_segment_end = 17, sei_payload_type_film_grain_characteristics = 19, sei_payload_type_post_filter_hint = 22, sei_payload_type_tone_mapping_info = 23, sei_payload_type_frame_packing_arrangement = 45, sei_payload_type_display_orientation = 47, sei_payload_type_structure_of_pictures_info = 128, sei_payload_type_active_parameter_sets = 129, sei_payload_type_decoding_unit_info = 130, sei_payload_type_temporal_sub_layer_zero_index = 131, sei_payload_type_decoded_picture_hash = 132, sei_payload_type_scalable_nesting = 133, sei_payload_type_region_refresh_info = 134, sei_payload_type_no_display = 135, sei_payload_type_motion_constrained_tile_sets = 136 }; enum sei_decoded_picture_hash_type { sei_decoded_picture_hash_type_MD5 = 0, sei_decoded_picture_hash_type_CRC = 1, sei_decoded_picture_hash_type_checksum = 2 }; typedef struct { enum sei_decoded_picture_hash_type hash_type; uint8_t md5[3][16]; uint16_t crc[3]; uint32_t checksum[3]; } sei_decoded_picture_hash; typedef struct { uint16_t payload_type; // enum sei_payload_type uint32_t payload_size; union 
{ sei_decoded_picture_hash decoded_picture_hash; } data; } sei_message; class seq_parameter_set; const char* sei_type_name(uint16_t type); de265_error read_sei(bitreader* reader, sei_message*, bool suffix, const seq_parameter_set* sps); void dump_sei(const sei_message*, const seq_parameter_set* sps); de265_error process_sei(const sei_message*, struct de265_image* img); #endif libde265-1.0.18/libde265/slice.cc000066400000000000000000004626461515675107500160750ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * Authors: struktur AG, Dirk Farin * Min Chen * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #include "slice.h" #include "motion.h" #include "util.h" #include "scan.h" #include "intrapred.h" #include "transform.h" #include "threads.h" #include "image.h" #include #include #include #define LOCK de265_mutex_lock(&ctx->thread_pool.mutex) #define UNLOCK de265_mutex_unlock(&ctx->thread_pool.mutex) extern bool read_short_term_ref_pic_set(error_queue* errqueue, const seq_parameter_set* sps, bitreader* br, ref_pic_set* out_set, int idxRps, // index of the set to be read const std::vector& sets, bool sliceRefPicSet); void read_coding_tree_unit(thread_context* tctx); void read_coding_quadtree(thread_context* tctx, int xCtb, int yCtb, int Log2CtbSizeY, int ctDepth); /* void decode_inter_block(decoder_context* ctx,thread_context* tctx, int xC, int yC, int log2CbSize); */ void slice_segment_header::set_defaults() { slice_index = 0; first_slice_segment_in_pic_flag = 1; no_output_of_prior_pics_flag = 0; slice_pic_parameter_set_id = 0; dependent_slice_segment_flag = 0; slice_segment_address = 0; slice_type = SLICE_TYPE_I; pic_output_flag = 1; colour_plane_id = 0; slice_pic_order_cnt_lsb = 0; short_term_ref_pic_set_sps_flag = 1; // ref_pic_set slice_ref_pic_set; short_term_ref_pic_set_idx = 0; num_long_term_sps = 0; num_long_term_pics = 0; //uint8_t lt_idx_sps[MAX_NUM_REF_PICS]; //int poc_lsb_lt[MAX_NUM_REF_PICS]; //char used_by_curr_pic_lt_flag[MAX_NUM_REF_PICS]; //char delta_poc_msb_present_flag[MAX_NUM_REF_PICS]; //int delta_poc_msb_cycle_lt[MAX_NUM_REF_PICS]; slice_temporal_mvp_enabled_flag = 0; slice_sao_luma_flag = 0; slice_sao_chroma_flag = 0; num_ref_idx_active_override_flag = 0; num_ref_idx_l0_active = 1; // [1;16] num_ref_idx_l1_active = 1; // [1;16] ref_pic_list_modification_flag_l0 = 0; ref_pic_list_modification_flag_l1 = 0; //uint8_t list_entry_l0[16]; //uint8_t list_entry_l1[16]; mvd_l1_zero_flag = 0; cabac_init_flag = 0; collocated_from_l0_flag = 0; collocated_ref_idx = 0; // --- pred_weight_table --- luma_log2_weight_denom = 0; // [0;7] 
ChromaLog2WeightDenom = 0; // [0;7] // first index is L0/L1 /* uint8_t luma_weight_flag[2][16]; // bool uint8_t chroma_weight_flag[2][16]; // bool int16_t LumaWeight[2][16]; int8_t luma_offset[2][16]; int16_t ChromaWeight[2][16][2]; int8_t ChromaOffset[2][16][2]; */ five_minus_max_num_merge_cand = 0; slice_qp_delta = 0; slice_cb_qp_offset = 0; slice_cr_qp_offset = 0; cu_chroma_qp_offset_enabled_flag = 0; deblocking_filter_override_flag = 0; slice_deblocking_filter_disabled_flag = 0; slice_beta_offset = 0; // = pps->beta_offset if undefined slice_tc_offset = 0; // = pps->tc_offset if undefined slice_loop_filter_across_slices_enabled_flag = 0; num_entry_point_offsets = 0; //int offset_len; //std::vector entry_point_offset; slice_segment_header_extension_length = 0; SliceAddrRS = slice_segment_address; } de265_error read_pred_weight_table(bitreader* br, slice_segment_header* shdr, decoder_context* ctx) { uint32_t uvlc; int32_t svlc; pic_parameter_set* pps = ctx->get_pps((int) shdr->slice_pic_parameter_set_id); assert(pps); seq_parameter_set* sps = ctx->get_sps((int) pps->seq_parameter_set_id); assert(sps); uvlc = br->get_uvlc(); if (uvlc > 7) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; shdr->luma_log2_weight_denom = uvlc; if (sps->chroma_format_idc != 0) { svlc = br->get_svlc(); svlc += shdr->luma_log2_weight_denom; if (svlc < 0 || svlc > 7) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; shdr->ChromaLog2WeightDenom = svlc; } int sumWeightFlags[2]{}; for (int l = 0; l <= 1; l++) if (l == 0 || (l == 1 && shdr->slice_type == SLICE_TYPE_B)) { int num_ref = (l == 0 ? 
shdr->num_ref_idx_l0_active - 1 : shdr->num_ref_idx_l1_active - 1); for (int i = 0; i <= num_ref; i++) { shdr->luma_weight_flag[l][i] = br->get_bits(1); if (shdr->luma_weight_flag[l][i]) sumWeightFlags[l]++; } if (sps->chroma_format_idc != 0) { for (int i = 0; i <= num_ref; i++) { shdr->chroma_weight_flag[l][i] = br->get_bits(1); if (shdr->chroma_weight_flag[l][i]) sumWeightFlags[l] += 2; } } for (int i = 0; i <= num_ref; i++) { if (shdr->luma_weight_flag[l][i]) { // delta_luma_weight svlc = br->get_svlc(); if (svlc < -128 || svlc > 127) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; shdr->LumaWeight[l][i] = (1 << shdr->luma_log2_weight_denom) + svlc; // luma_offset svlc = br->get_svlc(); if (svlc < -sps->WpOffsetHalfRangeY || svlc > sps->WpOffsetHalfRangeY - 1) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; shdr->luma_offset[l][i] = svlc; } else { shdr->LumaWeight[l][i] = 1 << shdr->luma_log2_weight_denom; shdr->luma_offset[l][i] = 0; } if (shdr->chroma_weight_flag[l][i]) for (int j = 0; j < 2; j++) { // delta_chroma_weight svlc = br->get_svlc(); if (svlc < -128 || svlc > 127) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; shdr->ChromaWeight[l][i][j] = (1 << shdr->ChromaLog2WeightDenom) + svlc; // delta_chroma_offset svlc = br->get_svlc(); if (svlc < -4 * sps->WpOffsetHalfRangeC || svlc > 4 * sps->WpOffsetHalfRangeC - 1) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; svlc = Clip3(-sps->WpOffsetHalfRangeC, sps->WpOffsetHalfRangeC-1, (sps->WpOffsetHalfRangeC +svlc -((sps->WpOffsetHalfRangeC*shdr->ChromaWeight[l][i][j]) >> shdr->ChromaLog2WeightDenom))); shdr->ChromaOffset[l][i][j] = svlc; } else { for (int j = 0; j < 2; j++) { shdr->ChromaWeight[l][i][j] = 1 << shdr->ChromaLog2WeightDenom; shdr->ChromaOffset[l][i][j] = 0; } } } } // check sumWeightFlags against limits (H.265, Section 7.4.7.3) if (shdr->slice_type == SLICE_TYPE_P && sumWeightFlags[0] > 24) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } if (shdr->slice_type == SLICE_TYPE_B && 
sumWeightFlags[0] + sumWeightFlags[1] > 24) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } return DE265_OK; } void slice_segment_header::reset() { pps = nullptr; slice_index = 0; first_slice_segment_in_pic_flag = 0; no_output_of_prior_pics_flag = 0; slice_pic_parameter_set_id = 0; dependent_slice_segment_flag = 0; slice_segment_address = 0; slice_type = 0; pic_output_flag = 0; colour_plane_id = 0; slice_pic_order_cnt_lsb = 0; short_term_ref_pic_set_sps_flag = 0; slice_ref_pic_set.reset(); short_term_ref_pic_set_idx = 0; num_long_term_sps = 0; num_long_term_pics = 0; for (int i = 0; i < MAX_NUM_REF_PICS; i++) { lt_idx_sps[i] = 0; poc_lsb_lt[i] = 0; used_by_curr_pic_lt_flag[i] = 0; delta_poc_msb_present_flag[i] = 0; delta_poc_msb_cycle_lt[i] = 0; } slice_temporal_mvp_enabled_flag = 0; slice_sao_luma_flag = 0; slice_sao_chroma_flag = 0; num_ref_idx_active_override_flag = 0; num_ref_idx_l0_active = 0; num_ref_idx_l1_active = 0; ref_pic_list_modification_flag_l0 = 0; ref_pic_list_modification_flag_l1 = 0; for (int i = 0; i < 16; i++) { list_entry_l0[i] = 0; list_entry_l1[i] = 0; } mvd_l1_zero_flag = 0; cabac_init_flag = 0; collocated_from_l0_flag = 0; collocated_ref_idx = 0; luma_log2_weight_denom = 0; ChromaLog2WeightDenom = 0; for (int i = 0; i < 2; i++) for (int j = 0; j < 16; j++) { luma_weight_flag[i][j] = 0; chroma_weight_flag[i][j] = 0; LumaWeight[i][j] = 0; luma_offset[i][j] = 0; ChromaWeight[i][j][0] = ChromaWeight[i][j][1] = 0; ChromaOffset[i][j][0] = ChromaOffset[i][j][1] = 0; } five_minus_max_num_merge_cand = 0; slice_qp_delta = 0; slice_cb_qp_offset = 0; slice_cr_qp_offset = 0; cu_chroma_qp_offset_enabled_flag = 0; deblocking_filter_override_flag = 0; slice_deblocking_filter_disabled_flag = 0; slice_beta_offset = 0; slice_tc_offset = 0; slice_loop_filter_across_slices_enabled_flag = 0; num_entry_point_offsets = 0; offset_len = 0; entry_point_offset.clear(); slice_segment_header_extension_length = 0; SliceAddrRS = 0; SliceQPY = 0; initType = 0; 
MaxNumMergeCand = 0; CurrRpsIdx = 0; CurrRps.reset(); NumPocTotalCurr = 0; for (int i = 0; i < 2; i++) for (int j = 0; j < MAX_NUM_REF_PICS; j++) { RefPicList[i][j] = 0; RefPicList_POC[i][j] = 0; RefPicList_PicState[i][j] = 0; LongTermRefPic[i][j] = 0; } //context_model ctx_model_storage[CONTEXT_MODEL_TABLE_LENGTH]; RemoveReferencesList.clear(); ctx_model_storage_defined = false; } de265_error slice_segment_header::read(bitreader* br, decoder_context* ctx, bool* continueDecoding) { *continueDecoding = false; reset(); uint32_t uvlc; int32_t svlc; // set defaults dependent_slice_segment_flag = 0; // read bitstream first_slice_segment_in_pic_flag = br->get_bits(1); if (ctx->get_RapPicFlag()) { // TODO: is this still correct ? Should we drop RapPicFlag ? no_output_of_prior_pics_flag = br->get_bits(1); } if ((uvlc = br->get_uvlc()) == UVLC_ERROR || uvlc >= DE265_MAX_PPS_SETS) { ctx->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false); return DE265_OK; } slice_pic_parameter_set_id = uvlc; if (!ctx->has_pps(slice_pic_parameter_set_id)) { ctx->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false); return DE265_OK; } pps = ctx->get_shared_pps(slice_pic_parameter_set_id); const seq_parameter_set* sps = pps->sps.get(); if (!sps->sps_read) { ctx->add_warning(DE265_WARNING_NONEXISTING_SPS_REFERENCED, false); *continueDecoding = false; return DE265_OK; } if (!first_slice_segment_in_pic_flag) { if (pps->dependent_slice_segments_enabled_flag) { dependent_slice_segment_flag = br->get_bits(1); } else { dependent_slice_segment_flag = 0; } uint32_t slice_segment_address = br->get_bits(ceil_log2(sps->PicSizeInCtbsY)); if (dependent_slice_segment_flag) { if (slice_segment_address == 0) { *continueDecoding = false; ctx->add_warning(DE265_WARNING_DEPENDENT_SLICE_WITH_ADDRESS_ZERO, false); return DE265_OK; } if (ctx->previous_slice_header == nullptr) { return DE265_ERROR_NO_INITIAL_SLICE_HEADER; } *this = *ctx->previous_slice_header; first_slice_segment_in_pic_flag = 0; 
dependent_slice_segment_flag = 1; } this->slice_segment_address = slice_segment_address; } else { dependent_slice_segment_flag = 0; slice_segment_address = 0; } if (slice_segment_address >= sps->PicSizeInCtbsY) { ctx->add_warning(DE265_WARNING_SLICE_SEGMENT_ADDRESS_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } //printf("SLICE %d (%d)\n",slice_segment_address, sps->PicSizeInCtbsY); if (!dependent_slice_segment_flag) { for (int i = 0; i < pps->num_extra_slice_header_bits; i++) { //slice_reserved_undetermined_flag[i] br->skip_bits(1); } if ((uvlc = br->get_uvlc()) == UVLC_ERROR || uvlc > 2) { ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); *continueDecoding = false; return DE265_OK; } slice_type = uvlc; if (pps->output_flag_present_flag) { pic_output_flag = br->get_bits(1); } else { pic_output_flag = 1; } if (sps->separate_colour_plane_flag == 1) { colour_plane_id = br->get_bits(2); } slice_pic_order_cnt_lsb = 0; short_term_ref_pic_set_sps_flag = 0; int NumLtPics = 0; if (ctx->get_nal_unit_type() != NAL_UNIT_IDR_W_RADL && ctx->get_nal_unit_type() != NAL_UNIT_IDR_N_LP) { slice_pic_order_cnt_lsb = br->get_bits(sps->log2_max_pic_order_cnt_lsb); short_term_ref_pic_set_sps_flag = br->get_bits(1); if (!short_term_ref_pic_set_sps_flag) { read_short_term_ref_pic_set(ctx, sps, br, &slice_ref_pic_set, sps->num_short_term_ref_pic_sets(), sps->ref_pic_sets, true); CurrRpsIdx = sps->num_short_term_ref_pic_sets(); CurrRps = slice_ref_pic_set; } else { int nBits = ceil_log2(sps->num_short_term_ref_pic_sets()); if (nBits > 0) short_term_ref_pic_set_idx = br->get_bits(nBits); else short_term_ref_pic_set_idx = 0; if (short_term_ref_pic_set_idx >= sps->num_short_term_ref_pic_sets()) { ctx->add_warning(DE265_WARNING_SHORT_TERM_REF_PIC_SET_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } CurrRpsIdx = short_term_ref_pic_set_idx; CurrRps = sps->ref_pic_sets[CurrRpsIdx]; } // --- long-term MC --- if 
(sps->long_term_ref_pics_present_flag) { if (sps->num_long_term_ref_pics_sps > 0) { if ((uvlc = br->get_uvlc()) == UVLC_ERROR || uvlc > sps->num_long_term_ref_pics_sps) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } num_long_term_sps = uvlc; } else { num_long_term_sps = 0; } if ((uvlc = br->get_uvlc()) == UVLC_ERROR || uvlc > MAX_NUM_LT_REF_PICS_SPS) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } num_long_term_pics = uvlc; // check maximum number of reference frames if (num_long_term_sps + num_long_term_pics + CurrRps.NumNegativePics + CurrRps.NumPositivePics > sps->sps_max_dec_pic_buffering[sps->sps_max_sub_layers - 1]) { ctx->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false); *continueDecoding = false; return DE265_OK; } for (int i = 0; i < num_long_term_sps + num_long_term_pics; i++) { if (i < num_long_term_sps) { int nBits = ceil_log2(sps->num_long_term_ref_pics_sps); lt_idx_sps[i] = br->get_bits(nBits); // check that the referenced lt-reference really exists if (lt_idx_sps[i] >= sps->num_long_term_ref_pics_sps) { ctx->add_warning(DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER, false); *continueDecoding = false; return DE265_OK; } // delta_poc_msb_present_flag[i] = 0; // TODO ? 
ctx->PocLsbLt[i] = sps->lt_ref_pic_poc_lsb_sps[lt_idx_sps[i]]; ctx->UsedByCurrPicLt[i] = sps->used_by_curr_pic_lt_sps_flag[lt_idx_sps[i]]; } else { int nBits = sps->log2_max_pic_order_cnt_lsb; poc_lsb_lt[i] = br->get_bits(nBits); used_by_curr_pic_lt_flag[i] = br->get_bits(1); ctx->PocLsbLt[i] = poc_lsb_lt[i]; ctx->UsedByCurrPicLt[i] = used_by_curr_pic_lt_flag[i]; } if (ctx->UsedByCurrPicLt[i]) { NumLtPics++; } delta_poc_msb_present_flag[i] = br->get_bits(1); if (delta_poc_msb_present_flag[i]) { if ((uvlc = br->get_uvlc()) == UVLC_ERROR) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } delta_poc_msb_cycle_lt[i] = uvlc; } else { delta_poc_msb_cycle_lt[i] = 0; } if (i == 0 || i == num_long_term_sps) { ctx->DeltaPocMsbCycleLt[i] = delta_poc_msb_cycle_lt[i]; } else { if (delta_poc_msb_cycle_lt[i] > UINT32_MAX - ctx->DeltaPocMsbCycleLt[i - 1]) { ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } ctx->DeltaPocMsbCycleLt[i] = (delta_poc_msb_cycle_lt[i] + ctx->DeltaPocMsbCycleLt[i - 1]); } } } else { num_long_term_sps = 0; num_long_term_pics = 0; } if (sps->sps_temporal_mvp_enabled_flag) { slice_temporal_mvp_enabled_flag = br->get_bits(1); } else { slice_temporal_mvp_enabled_flag = 0; } } else { slice_pic_order_cnt_lsb = 0; num_long_term_sps = 0; num_long_term_pics = 0; } // --- SAO --- if (sps->sample_adaptive_offset_enabled_flag) { slice_sao_luma_flag = br->get_bits(1); if (sps->ChromaArrayType != CHROMA_MONO) { slice_sao_chroma_flag = br->get_bits(1); } else { slice_sao_chroma_flag = 0; } } else { slice_sao_luma_flag = 0; slice_sao_chroma_flag = 0; } num_ref_idx_l0_active = 0; num_ref_idx_l1_active = 0; if (slice_type == SLICE_TYPE_P || slice_type == SLICE_TYPE_B) { num_ref_idx_active_override_flag = br->get_bits(1); if (num_ref_idx_active_override_flag) { if ((uvlc = br->get_uvlc()) == UVLC_ERROR || uvlc > 15) { ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); return 
DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } num_ref_idx_l0_active = uvlc + 1; if (slice_type == SLICE_TYPE_B) { if ((uvlc = br->get_uvlc()) == UVLC_ERROR || uvlc > 15) { ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } num_ref_idx_l1_active = uvlc + 1; } } else { num_ref_idx_l0_active = pps->num_ref_idx_l0_default_active; num_ref_idx_l1_active = pps->num_ref_idx_l1_default_active; } NumPocTotalCurr = CurrRps.NumPocTotalCurr_shortterm_only + NumLtPics; if (pps->lists_modification_present_flag && NumPocTotalCurr > 1) { int nBits = ceil_log2(NumPocTotalCurr); ref_pic_list_modification_flag_l0 = br->get_bits(1); if (ref_pic_list_modification_flag_l0) { for (int i = 0; i < num_ref_idx_l0_active; i++) { list_entry_l0[i] = br->get_bits(nBits); } } if (slice_type == SLICE_TYPE_B) { ref_pic_list_modification_flag_l1 = br->get_bits(1); if (ref_pic_list_modification_flag_l1) { for (int i = 0; i < num_ref_idx_l1_active; i++) { list_entry_l1[i] = br->get_bits(nBits); } } } else { ref_pic_list_modification_flag_l1 = 0; } } else { ref_pic_list_modification_flag_l0 = 0; ref_pic_list_modification_flag_l1 = 0; } if (slice_type == SLICE_TYPE_B) { mvd_l1_zero_flag = br->get_bits(1); } if (pps->cabac_init_present_flag) { cabac_init_flag = br->get_bits(1); } else { cabac_init_flag = 0; } if (slice_temporal_mvp_enabled_flag) { if (slice_type == SLICE_TYPE_B) collocated_from_l0_flag = br->get_bits(1); else collocated_from_l0_flag = 1; if ((collocated_from_l0_flag && num_ref_idx_l0_active > 1) || (!collocated_from_l0_flag && num_ref_idx_l1_active > 1)) { if ((uvlc = br->get_uvlc()) == UVLC_ERROR || uvlc > 15) { ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } collocated_ref_idx = uvlc; } else { collocated_ref_idx = 0; } // check whether collocated_ref_idx points to a valid index if ((collocated_from_l0_flag && collocated_ref_idx >= num_ref_idx_l0_active) || 
(!collocated_from_l0_flag && collocated_ref_idx >= num_ref_idx_l1_active)) { ctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } } if ((pps->weighted_pred_flag && slice_type == SLICE_TYPE_P) || (pps->weighted_bipred_flag && slice_type == SLICE_TYPE_B)) { de265_error err = read_pred_weight_table(br, this, ctx); if (err) { ctx->add_warning(err, false); return err; } } if ((uvlc = br->get_uvlc()) == UVLC_ERROR || uvlc > 5) { ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } five_minus_max_num_merge_cand = uvlc; MaxNumMergeCand = 5 - five_minus_max_num_merge_cand; } if ((svlc = br->get_svlc()) == SVLC_ERROR) { ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } slice_qp_delta = svlc; //logtrace(LogSlice,"slice_qp_delta: %d\n",shdr->slice_qp_delta); if (pps->pps_slice_chroma_qp_offsets_present_flag) { if ((svlc = br->get_svlc()) == SVLC_ERROR) { ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } slice_cb_qp_offset = svlc; if ((svlc = br->get_svlc()) == SVLC_ERROR) { ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } slice_cr_qp_offset = svlc; } else { slice_cb_qp_offset = 0; slice_cr_qp_offset = 0; } if (pps->range_extension.chroma_qp_offset_list_enabled_flag) { cu_chroma_qp_offset_enabled_flag = br->get_bits(1); } if (pps->deblocking_filter_override_enabled_flag) { deblocking_filter_override_flag = br->get_bits(1); } else { deblocking_filter_override_flag = 0; } slice_beta_offset = pps->beta_offset; slice_tc_offset = pps->tc_offset; if (deblocking_filter_override_flag) { slice_deblocking_filter_disabled_flag = br->get_bits(1); if (!slice_deblocking_filter_disabled_flag) { // slice_beta_offset_div2 shall be in [-6, 6] (Sec. 
7.4.7.1) if ((svlc = br->get_svlc()) == SVLC_ERROR || svlc < -6 || svlc > 6) { ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } slice_beta_offset = svlc * 2; // slice_tc_offset_div2 shall be in [-6, 6] (Sec. 7.4.7.1) if ((svlc = br->get_svlc()) == SVLC_ERROR || svlc < -6 || svlc > 6) { ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } slice_tc_offset = svlc * 2; } } else { slice_deblocking_filter_disabled_flag = pps->pic_disable_deblocking_filter_flag; } if (pps->pps_loop_filter_across_slices_enabled_flag && (slice_sao_luma_flag || slice_sao_chroma_flag || !slice_deblocking_filter_disabled_flag)) { slice_loop_filter_across_slices_enabled_flag = br->get_bits(1); } else { slice_loop_filter_across_slices_enabled_flag = pps->pps_loop_filter_across_slices_enabled_flag; } } if (pps->tiles_enabled_flag || pps->entropy_coding_sync_enabled_flag) { // compute the spec limit for num_entry_point_offsets int maxEntryPointOffsets; if (!pps->tiles_enabled_flag && pps->entropy_coding_sync_enabled_flag) { maxEntryPointOffsets = sps->PicHeightInCtbsY - 1; } else if (pps->tiles_enabled_flag && !pps->entropy_coding_sync_enabled_flag) { maxEntryPointOffsets = pps->num_tile_columns * pps->num_tile_rows - 1; } else { maxEntryPointOffsets = pps->num_tile_columns * sps->PicHeightInCtbsY - 1; } if ((uvlc = br->get_uvlc()) == UVLC_ERROR || uvlc > static_cast(maxEntryPointOffsets)) { ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } num_entry_point_offsets = uvlc; entry_point_offset.resize(num_entry_point_offsets); if (num_entry_point_offsets > 0) { if ((uvlc = br->get_uvlc()) == UVLC_ERROR || uvlc > 31) { ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } offset_len = uvlc + 1; for (int i = 0; i < num_entry_point_offsets; i++) { { uint32_t 
offset_minus1 = br->get_bits(offset_len); if (offset_minus1 == UINT32_MAX) { ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } entry_point_offset[i] = offset_minus1 + 1; } if (i > 0) { if (entry_point_offset[i] > UINT32_MAX - entry_point_offset[i - 1]) { ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } entry_point_offset[i] += entry_point_offset[i - 1]; } } } } else { num_entry_point_offsets = 0; } if (pps->slice_segment_header_extension_present_flag) { if ((uvlc = br->get_uvlc()) == UVLC_ERROR || uvlc > 1000) { // TODO: safety check against too large values ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } slice_segment_header_extension_length = uvlc; for (int i = 0; i < slice_segment_header_extension_length; i++) { //slice_segment_header_extension_data_byte[i] br->get_bits(8); } } compute_derived_values(pps.get()); // SliceQpY shall be in [-QpBdOffsetY, 51] (Sec. 7.4.7.1) if (SliceQPY < -sps->QpBdOffset_Y || SliceQPY > 51) { ctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } *continueDecoding = true; return DE265_OK; } de265_error slice_segment_header::write(error_queue* errqueue, CABAC_encoder& out, const seq_parameter_set* sps, const pic_parameter_set* pps, uint8_t nal_unit_type) { out.write_bit(first_slice_segment_in_pic_flag); if (isRapPic(nal_unit_type)) { // TODO: is this still correct ? Should we drop RapPicFlag ? 
out.write_bit(no_output_of_prior_pics_flag); } if (slice_pic_parameter_set_id > DE265_MAX_PPS_SETS) { errqueue->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false); return DE265_OK; } out.write_uvlc(slice_pic_parameter_set_id); if (!first_slice_segment_in_pic_flag) { if (pps->dependent_slice_segments_enabled_flag) { out.write_bit(dependent_slice_segment_flag); } out.write_bits(slice_segment_address, ceil_log2(sps->PicSizeInCtbsY)); if (dependent_slice_segment_flag) { if (slice_segment_address == 0) { errqueue->add_warning(DE265_WARNING_DEPENDENT_SLICE_WITH_ADDRESS_ZERO, false); return DE265_OK; } } } if (slice_segment_address > sps->PicSizeInCtbsY) { errqueue->add_warning(DE265_WARNING_SLICE_SEGMENT_ADDRESS_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } if (!dependent_slice_segment_flag) { for (int i = 0; i < pps->num_extra_slice_header_bits; i++) { //slice_reserved_undetermined_flag[i] out.skip_bits(1); } if (slice_type > 2) { errqueue->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); return DE265_OK; } out.write_uvlc(slice_type); if (pps->output_flag_present_flag) { out.write_bit(pic_output_flag); } if (sps->separate_colour_plane_flag == 1) { out.write_bits(colour_plane_id, 2); } int NumLtPics = 0; if (nal_unit_type != NAL_UNIT_IDR_W_RADL && nal_unit_type != NAL_UNIT_IDR_N_LP) { out.write_bits(slice_pic_order_cnt_lsb, sps->log2_max_pic_order_cnt_lsb); out.write_bit(short_term_ref_pic_set_sps_flag); if (!short_term_ref_pic_set_sps_flag) { /* TODO read_short_term_ref_pic_set(ctx, sps, br, &slice_ref_pic_set, sps->num_short_term_ref_pic_sets, sps->ref_pic_sets, true); */ //CurrRpsIdx = sps->num_short_term_ref_pic_sets; //CurrRps = slice_ref_pic_set; } else { int nBits = ceil_log2(sps->num_short_term_ref_pic_sets()); if (nBits > 0) out.write_bits(short_term_ref_pic_set_idx, nBits); else { assert(short_term_ref_pic_set_idx==0); } if (short_term_ref_pic_set_idx > sps->num_short_term_ref_pic_sets()) { 
errqueue->add_warning(DE265_WARNING_SHORT_TERM_REF_PIC_SET_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } //CurrRpsIdx = short_term_ref_pic_set_idx; //CurrRps = sps->ref_pic_sets[CurrRpsIdx]; } // --- long-term MC --- if (sps->long_term_ref_pics_present_flag) { if (sps->num_long_term_ref_pics_sps > 0) { out.write_uvlc(num_long_term_sps); } else { assert(num_long_term_sps == 0); } out.write_uvlc(num_long_term_pics); // check maximum number of reference frames if (num_long_term_sps + num_long_term_pics + CurrRps.NumNegativePics + CurrRps.NumPositivePics > sps->sps_max_dec_pic_buffering[sps->sps_max_sub_layers - 1]) { errqueue->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false); return DE265_OK; } for (int i = 0; i < num_long_term_sps + num_long_term_pics; i++) { if (i < num_long_term_sps) { int nBits = ceil_log2(sps->num_long_term_ref_pics_sps); out.write_bits(lt_idx_sps[i], nBits); // check that the referenced lt-reference really exists if (lt_idx_sps[i] >= sps->num_long_term_ref_pics_sps) { errqueue->add_warning(DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER, false); return DE265_OK; } //ctx->PocLsbLt[i] = sps->lt_ref_pic_poc_lsb_sps[ lt_idx_sps[i] ]; //ctx->UsedByCurrPicLt[i] = sps->used_by_curr_pic_lt_sps_flag[ lt_idx_sps[i] ]; } else { int nBits = sps->log2_max_pic_order_cnt_lsb; out.write_bits(poc_lsb_lt[i], nBits); out.write_bit(used_by_curr_pic_lt_flag[i]); //ctx->PocLsbLt[i] = poc_lsb_lt[i]; //ctx->UsedByCurrPicLt[i] = used_by_curr_pic_lt_flag[i]; } //if (ctx->UsedByCurrPicLt[i]) { //NumLtPics++; //} out.write_bit(delta_poc_msb_present_flag[i]); if (delta_poc_msb_present_flag[i]) { out.write_uvlc(delta_poc_msb_cycle_lt[i]); } else { assert(delta_poc_msb_cycle_lt[i] == 0); } /* if (i==0 || i==num_long_term_sps) { ctx->DeltaPocMsbCycleLt[i] = delta_poc_msb_cycle_lt[i]; } else { ctx->DeltaPocMsbCycleLt[i] = (delta_poc_msb_cycle_lt[i] + ctx->DeltaPocMsbCycleLt[i-1]); } */ } } else { assert(num_long_term_sps == 0); 
assert(num_long_term_pics== 0); } if (sps->sps_temporal_mvp_enabled_flag) { out.write_bit(slice_temporal_mvp_enabled_flag); } else { assert(slice_temporal_mvp_enabled_flag == 0); } } else { assert(slice_pic_order_cnt_lsb == 0); assert(num_long_term_sps == 0); assert(num_long_term_pics== 0); } // --- SAO --- if (sps->sample_adaptive_offset_enabled_flag) { out.write_bit(slice_sao_luma_flag); out.write_bit(slice_sao_chroma_flag); } else { assert(slice_sao_luma_flag == 0); assert(slice_sao_chroma_flag== 0); } if (slice_type == SLICE_TYPE_P || slice_type == SLICE_TYPE_B) { out.write_bit(num_ref_idx_active_override_flag); if (num_ref_idx_active_override_flag) { out.write_uvlc(num_ref_idx_l0_active); num_ref_idx_l0_active++;; if (slice_type == SLICE_TYPE_B) { out.write_uvlc(num_ref_idx_l1_active); num_ref_idx_l1_active++; } } else { assert(num_ref_idx_l0_active == pps->num_ref_idx_l0_default_active); assert(num_ref_idx_l1_active == pps->num_ref_idx_l1_default_active); } NumPocTotalCurr = CurrRps.NumPocTotalCurr_shortterm_only + NumLtPics; if (pps->lists_modification_present_flag && NumPocTotalCurr > 1) { int nBits = ceil_log2(NumPocTotalCurr); out.write_bit(ref_pic_list_modification_flag_l0); if (ref_pic_list_modification_flag_l0) { for (int i = 0; i < num_ref_idx_l0_active; i++) { out.write_bits(list_entry_l0[i], nBits); } } if (slice_type == SLICE_TYPE_B) { out.write_bit(ref_pic_list_modification_flag_l1); if (ref_pic_list_modification_flag_l1) { for (int i = 0; i < num_ref_idx_l1_active; i++) { out.write_bits(list_entry_l1[i], nBits); } } } else { assert(ref_pic_list_modification_flag_l1 == 0); } } else { assert(ref_pic_list_modification_flag_l0 == 0); assert(ref_pic_list_modification_flag_l1 == 0); } if (slice_type == SLICE_TYPE_B) { out.write_bit(mvd_l1_zero_flag); } if (pps->cabac_init_present_flag) { out.write_bit(cabac_init_flag); } else { assert(cabac_init_flag == 0); } if (slice_temporal_mvp_enabled_flag) { if (slice_type == SLICE_TYPE_B) 
out.write_bit(collocated_from_l0_flag); else { assert(collocated_from_l0_flag == 1); } if ((collocated_from_l0_flag && num_ref_idx_l0_active > 1) || (!collocated_from_l0_flag && num_ref_idx_l1_active > 1)) { out.write_uvlc(collocated_ref_idx); } else { assert(collocated_ref_idx == 0); } } if ((pps->weighted_pred_flag && slice_type == SLICE_TYPE_P) || (pps->weighted_bipred_flag && slice_type == SLICE_TYPE_B)) { assert(0); /* TODO if (read_pred_weight_table(br,this,ctx) != DE265_OK) { ctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } */ } out.write_uvlc(five_minus_max_num_merge_cand); //MaxNumMergeCand = 5-five_minus_max_num_merge_cand; } out.write_svlc(slice_qp_delta); if (pps->pps_slice_chroma_qp_offsets_present_flag) { out.write_svlc(slice_cb_qp_offset); out.write_svlc(slice_cr_qp_offset); } else { assert(slice_cb_qp_offset == 0); assert(slice_cr_qp_offset == 0); } if (pps->deblocking_filter_override_enabled_flag) { out.write_bit(deblocking_filter_override_flag); } else { assert(deblocking_filter_override_flag == 0); } //slice_beta_offset = pps->beta_offset; //slice_tc_offset = pps->tc_offset; if (deblocking_filter_override_flag) { out.write_bit(slice_deblocking_filter_disabled_flag); if (!slice_deblocking_filter_disabled_flag) { out.write_svlc(slice_beta_offset / 2); out.write_svlc(slice_tc_offset / 2); } } else { assert(slice_deblocking_filter_disabled_flag == pps->pic_disable_deblocking_filter_flag); } if (pps->pps_loop_filter_across_slices_enabled_flag && (slice_sao_luma_flag || slice_sao_chroma_flag || !slice_deblocking_filter_disabled_flag)) { out.write_bit(slice_loop_filter_across_slices_enabled_flag); } else { assert(slice_loop_filter_across_slices_enabled_flag == pps->pps_loop_filter_across_slices_enabled_flag); } } if (pps->tiles_enabled_flag || pps->entropy_coding_sync_enabled_flag) { out.write_uvlc(num_entry_point_offsets); if (num_entry_point_offsets > 0) { out.write_uvlc(offset_len - 
1); for (int i = 0; i < num_entry_point_offsets; i++) { { uint32_t prev = 0; if (i > 0) prev = entry_point_offset[i - 1]; out.write_bits(entry_point_offset[i] - prev - 1, offset_len); } } } } else { assert(num_entry_point_offsets == 0); } if (pps->slice_segment_header_extension_present_flag) { out.write_uvlc(slice_segment_header_extension_length); if (slice_segment_header_extension_length > 1000) { // TODO: safety check against too large values errqueue->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } for (int i = 0; i < slice_segment_header_extension_length; i++) { //slice_segment_header_extension_data_byte[i] out.skip_bits(8); } } return DE265_OK; } void slice_segment_header::compute_derived_values(const pic_parameter_set* pps) { // --- init variables --- SliceQPY = pps->pic_init_qp + slice_qp_delta; switch (slice_type) { case SLICE_TYPE_I: initType = 0; break; case SLICE_TYPE_P: initType = cabac_init_flag + 1; break; case SLICE_TYPE_B: initType = 2 - cabac_init_flag; break; } MaxNumMergeCand = 5 - five_minus_max_num_merge_cand; } //----------------------------------------------------------------------- void slice_segment_header::dump_slice_segment_header(const decoder_context* ctx, int fd) const { FILE* fh; if (fd == 1) fh = stdout; else if (fd == 2) fh = stderr; else { return; } #define LOG0(t) log2fh(fh, t) #define LOG1(t,d) log2fh(fh, t,d) #define LOG2(t,d1,d2) log2fh(fh, t,d1,d2) #define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3) #define LOG4(t,d1,d2,d3,d4) log2fh(fh, t,d1,d2,d3,d4) LOG0("----------------- SLICE -----------------\n"); const pic_parameter_set* pps = ctx->get_pps(slice_pic_parameter_set_id); if (!pps) { LOG0("invalid PPS referenced\n"); return; } assert(pps->pps_read); // TODO: error handling const seq_parameter_set* sps = ctx->get_sps((int) pps->seq_parameter_set_id); if (!sps) { LOG0("invalid SPS referenced\n"); return; } assert(sps->sps_read); // TODO: error handling 
LOG1("first_slice_segment_in_pic_flag : %d\n", first_slice_segment_in_pic_flag);

  // no_output_of_prior_pics_flag is only shown for NAL types in the range
  // NAL_UNIT_BLA_W_LP .. NAL_UNIT_RESERVED_IRAP_VCL23
  if (ctx->get_nal_unit_type() >= NAL_UNIT_BLA_W_LP &&
      ctx->get_nal_unit_type() <= NAL_UNIT_RESERVED_IRAP_VCL23) {
    LOG1("no_output_of_prior_pics_flag : %d\n", no_output_of_prior_pics_flag);
  }

  LOG1("slice_pic_parameter_set_id : %d\n", slice_pic_parameter_set_id);

  if (!first_slice_segment_in_pic_flag) {
    //if (pps->dependent_slice_segments_enabled_flag) {
    LOG1("dependent_slice_segment_flag : %d\n", dependent_slice_segment_flag);
    //}
    LOG1("slice_segment_address : %d\n", slice_segment_address);
  }

  // The condition below is disabled, so the remaining header fields are
  // printed for dependent slice segments as well. The brace opens a plain
  // scope that is closed after the loop-filter flag further down.
  //if (!dependent_slice_segment_flag)
  {
    //for (int i=0; i<pps->num_extra_slice_header_bits; i++) {
    //slice_reserved_flag[i]

    // slice_type 0 is printed as 'B', 1 as 'P', everything else as 'I'
    LOG1("slice_type : %c\n", slice_type == 0 ? 'B' : slice_type == 1 ? 'P' : 'I');

    if (pps->output_flag_present_flag) {
      LOG1("pic_output_flag : %d\n", pic_output_flag);
    }

    if (sps->separate_colour_plane_flag == 1) {
      LOG1("colour_plane_id : %d\n", colour_plane_id);
    }

    LOG1("slice_pic_order_cnt_lsb : %d\n", slice_pic_order_cnt_lsb);

    // reference picture information is not shown for IDR pictures
    if (ctx->get_nal_unit_type() != NAL_UNIT_IDR_W_RADL &&
        ctx->get_nal_unit_type() != NAL_UNIT_IDR_N_LP) {
      LOG1("short_term_ref_pic_set_sps_flag : %d\n", short_term_ref_pic_set_sps_flag);

      if (!short_term_ref_pic_set_sps_flag) {
        // set coded in the slice header itself
        LOG1("ref_pic_set[ %2d ]: ", sps->num_short_term_ref_pic_sets());
        dump_compact_short_term_ref_pic_set(&slice_ref_pic_set, 16, fh);
      }
      else if (sps->num_short_term_ref_pic_sets() > 1) {
        // set selected from the SPS by index
        LOG1("short_term_ref_pic_set_idx : %d\n", short_term_ref_pic_set_idx);
        dump_compact_short_term_ref_pic_set(&sps->ref_pic_sets[short_term_ref_pic_set_idx], 16, fh);
      }

      if (sps->long_term_ref_pics_present_flag) {
        if (sps->num_long_term_ref_pics_sps > 0) {
          LOG1("num_long_term_sps : %d\n", num_long_term_sps);
        }

        LOG1("num_long_term_pics : %d\n", num_long_term_pics);

        // disabled: per-picture long-term values that are stored in the decoder context
#if 0
        for (int i = 0; i < num_long_term_sps + num_long_term_pics; i++) {
          LOG2("PocLsbLt[%d] : %d\n", i, ctx->PocLsbLt[i]);
          LOG2("UsedByCurrPicLt[%d] : %d\n", i, ctx->UsedByCurrPicLt[i]);
          LOG2("DeltaPocMsbCycleLt[%d] : %d\n", i, ctx->DeltaPocMsbCycleLt[i]);
        }
#endif
      }

      if (sps->sps_temporal_mvp_enabled_flag) {
        LOG1("slice_temporal_mvp_enabled_flag : %d\n", slice_temporal_mvp_enabled_flag);
      }
    }

    if (sps->sample_adaptive_offset_enabled_flag) {
      LOG1("slice_sao_luma_flag : %d\n", slice_sao_luma_flag);
      LOG1("slice_sao_chroma_flag : %d\n", slice_sao_chroma_flag);
    }

    // --- fields that only exist for inter (P/B) slices ---
    if (slice_type == SLICE_TYPE_P || slice_type == SLICE_TYPE_B) {
      LOG1("num_ref_idx_active_override_flag : %d\n", num_ref_idx_active_override_flag);

      LOG2("num_ref_idx_l0_active : %d %s\n", num_ref_idx_l0_active,
           num_ref_idx_active_override_flag ? "" : "(from PPS)");

      if (slice_type == SLICE_TYPE_B) {
        LOG2("num_ref_idx_l1_active : %d %s\n", num_ref_idx_l1_active,
             num_ref_idx_active_override_flag ? "" : "(from PPS)");
      }

      if (pps->lists_modification_present_flag && NumPocTotalCurr > 1) {
        LOG1("ref_pic_list_modification_flag_l0 : %d\n", ref_pic_list_modification_flag_l0);
        if (ref_pic_list_modification_flag_l0) {
          for (int i = 0; i < num_ref_idx_l0_active; i++) {
            LOG2(" %d: %d\n", i, list_entry_l0[i]);
          }
        }

        // the L1 flag is printed for every slice type, not only for B slices
        LOG1("ref_pic_list_modification_flag_l1 : %d\n", ref_pic_list_modification_flag_l1);
        if (ref_pic_list_modification_flag_l1) {
          for (int i = 0; i < num_ref_idx_l1_active; i++) {
            LOG2(" %d: %d\n", i, list_entry_l1[i]);
          }
        }
      }

      if (slice_type == SLICE_TYPE_B) {
        LOG1("mvd_l1_zero_flag : %d\n", mvd_l1_zero_flag);
      }

      LOG1("cabac_init_flag : %d\n", cabac_init_flag);

      if (slice_temporal_mvp_enabled_flag) {
        LOG1("collocated_from_l0_flag : %d\n", collocated_from_l0_flag);
        LOG1("collocated_ref_idx : %d\n", collocated_ref_idx);
      }

      // --- weighted prediction table ---
      if ((pps->weighted_pred_flag && slice_type == SLICE_TYPE_P) ||
          (pps->weighted_bipred_flag && slice_type == SLICE_TYPE_B)) {
        LOG1("luma_log2_weight_denom : %d\n", luma_log2_weight_denom);
        if (sps->chroma_format_idc != 0) {
          LOG1("ChromaLog2WeightDenom : %d\n", ChromaLog2WeightDenom);
        }

        // list 0 is always shown, list 1 only for B slices
        for (int l = 0; l <= 1; l++)
          if (l == 0 || (l == 1 && slice_type == SLICE_TYPE_B)) {
            // index of the last active reference of list l
            int num_ref = (l == 0 ? num_ref_idx_l0_active - 1 : num_ref_idx_l1_active - 1);

            if (false) { // do not show these flags
              for (int i = 0; i <= num_ref; i++) {
                LOG3("luma_weight_flag_l%d[%d] : %d\n", l, i, luma_weight_flag[l][i]);
              }

              if (sps->chroma_format_idc != 0) {
                for (int i = 0; i <= num_ref; i++) {
                  LOG3("chroma_weight_flag_l%d[%d] : %d\n", l, i, chroma_weight_flag[l][i]);
                }
              }
            }

            // NOTE(review): the chroma weights/offsets are printed without
            // checking chroma_format_idc — confirm that this is intended
            // for monochrome streams.
            for (int i = 0; i <= num_ref; i++) {
              LOG3("LumaWeight_L%d[%d] : %d\n", l, i, LumaWeight[l][i]);
              LOG3("luma_offset_l%d[%d] : %d\n", l, i, luma_offset[l][i]);

              for (int j = 0; j < 2; j++) {
                LOG4("ChromaWeight_L%d[%d][%d] : %d\n", l, i, j, ChromaWeight[l][i][j]);
                LOG4("ChromaOffset_L%d[%d][%d] : %d\n", l, i, j, ChromaOffset[l][i][j]);
              }
            }
          }
      }

      LOG1("five_minus_max_num_merge_cand : %d\n", five_minus_max_num_merge_cand);
    }

    LOG1("slice_qp_delta : %d\n", slice_qp_delta);
    if (pps->pps_slice_chroma_qp_offsets_present_flag) {
      LOG1("slice_cb_qp_offset : %d\n", slice_cb_qp_offset);
      LOG1("slice_cr_qp_offset : %d\n", slice_cr_qp_offset);
    }

    if (pps->deblocking_filter_override_enabled_flag) {
      LOG1("deblocking_filter_override_flag : %d\n", deblocking_filter_override_flag);
    }

    // the suffix tells whether the value was coded in the slice header or taken from the PPS
    LOG2("slice_deblocking_filter_disabled_flag : %d %s\n",
         slice_deblocking_filter_disabled_flag,
         (deblocking_filter_override_flag ? "(override)" : "(from pps)"));

    if (deblocking_filter_override_flag) {
      if (!slice_deblocking_filter_disabled_flag) {
        LOG1("slice_beta_offset : %d\n", slice_beta_offset);
        LOG1("slice_tc_offset : %d\n", slice_tc_offset);
      }
    }

    if (pps->pps_loop_filter_across_slices_enabled_flag &&
        (slice_sao_luma_flag || slice_sao_chroma_flag ||
         !slice_deblocking_filter_disabled_flag)) {
      LOG1("slice_loop_filter_across_slices_enabled_flag : %d\n",
           slice_loop_filter_across_slices_enabled_flag);
    }
  }

  // --- entry points (tiles / wavefronts) ---
  if (pps->tiles_enabled_flag || pps->entropy_coding_sync_enabled_flag) {
    LOG1("num_entry_point_offsets : %d\n", num_entry_point_offsets);

    if (num_entry_point_offsets > 0) {
      LOG1("offset_len : %d\n", offset_len);

      for (int i = 0; i < num_entry_point_offsets; i++) {
        LOG2("entry point [%i] : %d\n", i, entry_point_offset[i]);
      }
    }
  }

  /* The slice header extension is not dumped:

     if( slice_segment_header_extension_present_flag ) {
       slice_segment_header_extension_length
       for( i = 0; i < slice_segment_header_extension_length; i++)
         slice_segment_header_extension_data_byte[i]
     }
     byte_alignment()
     }
  */

  // the logging helpers are local to this function
#undef LOG0
#undef LOG1
#undef LOG2
#undef LOG3
#undef LOG4
  //#endif
}


/* Reset the CABAC state of a thread context for decoding a slice.

   The context models are initialized from the initType and SliceQPY of the
   slice header that is attached to 'tctx', and the four StatCoeff entries
   are cleared.
*/
void initialize_CABAC_models(thread_context* tctx)
{
  const int QPY = tctx->shdr->SliceQPY;
  const int initType = tctx->shdr->initType;
  assert(initType >= 0 && initType <= 2);

  tctx->ctx_model.init(initType, QPY);

  for (int i = 0; i < 4; i++) {
    tctx->StatCoeff[i] = 0;
  }
}


/* Decode transform_skip_flag. Context 0 is used for cIdx==0, context 1 for
   every other component index. Returns the decoded bit. */
static int decode_transform_skip_flag(thread_context* tctx, int cIdx)
{
  const int context = (cIdx == 0) ?
0 : 1; logtrace(LogSlice, "# transform_skip_flag (context=%d)\n", context); int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_TRANSFORM_SKIP_FLAG + context]); logtrace(LogSymbols, "$1 transform_skip_flag=%d\n", bit); return bit; } static int decode_sao_merge_flag(thread_context* tctx) { logtrace(LogSlice, "# sao_merge_left/up_flag\n"); int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_SAO_MERGE_FLAG]); logtrace(LogSymbols, "$1 sao_merge_flag=%d\n", bit); return bit; } static uint8_t decode_sao_type_idx(thread_context* tctx) { logtrace(LogSlice, "# sao_type_idx_luma/chroma\n"); int bit0 = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_SAO_TYPE_IDX]); if (bit0 == 0) { logtrace(LogSymbols, "$1 sao_type_idx=%d\n", 0); return 0; } else { int bit1 = tctx->cabac_decoder.decode_bypass(); if (bit1 == 0) { logtrace(LogSymbols, "$1 sao_type_idx=%d\n", 1); return 1; } else { logtrace(LogSymbols, "$1 sao_type_idx=%d\n", 2); return 2; } } } static uint8_t decode_sao_offset_abs(thread_context* tctx, int bitDepth) { logtrace(LogSlice, "# sao_offset_abs\n"); int cMax = (1 << (libde265_min(bitDepth, 10) - 5)) - 1; assert(cMax >= 7 && cMax<=31); uint8_t value = static_cast(tctx->cabac_decoder.decode_TU_bypass( cMax)); logtrace(LogSymbols, "$1 sao_offset_abs=%d\n", value); return value; } static int decode_sao_class(thread_context* tctx) { logtrace(LogSlice, "# sao_class\n"); int value = tctx->cabac_decoder.decode_FL_bypass( 2); logtrace(LogSymbols, "$1 sao_class=%d\n", value); return value; } static int decode_sao_offset_sign(thread_context* tctx) { logtrace(LogSlice, "# sao_offset_sign\n"); int value = tctx->cabac_decoder.decode_bypass(); logtrace(LogSymbols, "$1 sao_offset_sign=%d\n", value); return value; } static int decode_sao_band_position(thread_context* tctx) { logtrace(LogSlice, "# sao_band_position\n"); int value = tctx->cabac_decoder.decode_FL_bypass( 5); logtrace(LogSymbols, "$1 sao_band_position=%d\n", value); 
return value; } static int decode_transquant_bypass_flag(thread_context* tctx) { logtrace(LogSlice, "# cu_transquant_bypass_enable_flag\n"); int value = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG]); logtrace(LogSymbols, "$1 transquant_bypass_flag=%d\n", value); return value; } #include #include static int decode_split_cu_flag(thread_context* tctx, int x0, int y0, int ctDepth) { // check if neighbors are available int availableL = check_CTB_available(tctx->img, x0, y0, x0 - 1, y0); int availableA = check_CTB_available(tctx->img, x0, y0, x0, y0 - 1); int condL = 0; int condA = 0; if (availableL && tctx->img->get_ctDepth(x0 - 1, y0) > ctDepth) condL = 1; if (availableA && tctx->img->get_ctDepth(x0, y0 - 1) > ctDepth) condA = 1; int contextOffset = condL + condA; int context = contextOffset; // decode bit logtrace(LogSlice, "# split_cu_flag context=%d R=%x\n", context, tctx->cabac_decoder.range); int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_SPLIT_CU_FLAG + context]); logtrace(LogSlice, "> split_cu_flag R=%x, ctx=%d, bit=%d\n", tctx->cabac_decoder.range, context, bit); logtrace(LogSymbols, "$1 split_cu_flag=%d\n", bit); return bit; } static int decode_cu_skip_flag(thread_context* tctx, int x0, int y0, int ctDepth) { // check if neighbors are available int availableL = check_CTB_available(tctx->img, x0, y0, x0 - 1, y0); int availableA = check_CTB_available(tctx->img, x0, y0, x0, y0 - 1); int condL = 0; int condA = 0; if (availableL && tctx->img->get_cu_skip_flag(x0 - 1, y0)) condL = 1; if (availableA && tctx->img->get_cu_skip_flag(x0, y0 - 1)) condA = 1; int contextOffset = condL + condA; int context = contextOffset; // decode bit logtrace(LogSlice, "# cu_skip_flag context=%d R=%x\n", context, tctx->cabac_decoder.range); int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_CU_SKIP_FLAG + context]); logtrace(LogSlice, "> cu_skip_flag R=%x, ctx=%d, bit=%d\n", 
tctx->cabac_decoder.range, context, bit); logtrace(LogSymbols, "$1 cu_skip_flag=%d\n", bit); return bit; } static enum PartMode decode_part_mode(thread_context* tctx, enum PredMode pred_mode, int cLog2CbSize) { de265_image* img = tctx->img; if (pred_mode == MODE_INTRA) { logtrace(LogSlice, "# part_mode (INTRA)\n"); int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_PART_MODE]); logtrace(LogSlice, "> %s\n", bit ? "2Nx2N" : "NxN"); logtrace(LogSymbols, "$1 part_mode=%d\n", bit ? PART_2Nx2N : PART_NxN); return bit ? PART_2Nx2N : PART_NxN; } else { const seq_parameter_set& sps = img->get_sps(); int bit0 = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_PART_MODE + 0]); if (bit0) { logtrace(LogSymbols, "$1 part_mode=%d\n", PART_2Nx2N); return PART_2Nx2N; } // CHECK_ME: I optimize code and fix bug here, need more VERIFY! int bit1 = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_PART_MODE + 1]); if (cLog2CbSize > sps.Log2MinCbSizeY) { if (!sps.amp_enabled_flag) { logtrace(LogSymbols, "$1 part_mode=%d\n", bit1 ? PART_2NxN : PART_Nx2N); return bit1 ? PART_2NxN : PART_Nx2N; } else { int bit3 = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_PART_MODE + 3]); if (bit3) { logtrace(LogSymbols, "$1 part_mode=%d\n", bit1 ? PART_2NxN : PART_Nx2N); return bit1 ? 
PART_2NxN : PART_Nx2N; } int bit4 = tctx->cabac_decoder.decode_bypass(); if (bit1 && bit4) { logtrace(LogSymbols, "$1 part_mode=%d\n", PART_2NxnD); return PART_2NxnD; } if (bit1 && !bit4) { logtrace(LogSymbols, "$1 part_mode=%d\n", PART_2NxnU); return PART_2NxnU; } if (!bit1 && !bit4) { logtrace(LogSymbols, "$1 part_mode=%d\n", PART_nLx2N); return PART_nLx2N; } if (!bit1 && bit4) { logtrace(LogSymbols, "$1 part_mode=%d\n", PART_nRx2N); return PART_nRx2N; } } } else { // TODO, we could save one if here when first decoding the next bin and then // checkcLog2CbSize==3 when it is '0' if (bit1) { logtrace(LogSymbols, "$1 part_mode=%d\n", PART_2NxN); return PART_2NxN; } if (cLog2CbSize == 3) { logtrace(LogSymbols, "$1 part_mode=%d\n", PART_Nx2N); return PART_Nx2N; } else { int bit2 = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_PART_MODE + 2]); logtrace(LogSymbols, "$1 part_mode=%d\n", PART_NxN - bit2); return (enum PartMode) ((int) PART_NxN - bit2)/*bit2 ? PART_Nx2N : PART_NxN*/; } } } assert(false); // should never be reached return PART_2Nx2N; } static inline int decode_prev_intra_luma_pred_flag(thread_context* tctx) { logtrace(LogSlice, "# prev_intra_luma_pred_flag\n"); int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG]); logtrace(LogSymbols, "$1 prev_intra_luma_pred_flag=%d\n", bit); return bit; } static inline int decode_mpm_idx(thread_context* tctx) { logtrace(LogSlice, "# mpm_idx (TU:2)\n"); int mpm = tctx->cabac_decoder.decode_TU_bypass( 2); logtrace(LogSlice, "> mpm_idx = %d\n", mpm); logtrace(LogSymbols, "$1 mpm_idx=%d\n", mpm); return mpm; } static inline int decode_rem_intra_luma_pred_mode(thread_context* tctx) { logtrace(LogSlice, "# rem_intra_luma_pred_mode (5 bits)\n"); int value = tctx->cabac_decoder.decode_FL_bypass( 5); logtrace(LogSymbols, "$1 rem_intra_luma_pred_mode=%d\n", value); return value; } static int decode_intra_chroma_pred_mode(thread_context* tctx) { logtrace(LogSlice, "# 
intra_chroma_pred_mode\n"); int prefix = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE]); int mode; if (prefix == 0) { mode = 4; } else { mode = tctx->cabac_decoder.decode_FL_bypass( 2); } logtrace(LogSlice, "> intra_chroma_pred_mode = %d\n", mode); logtrace(LogSymbols, "$1 intra_chroma_pred_mode=%d\n", mode); return mode; } static int decode_split_transform_flag(thread_context* tctx, int log2TrafoSize) { logtrace(LogSlice, "# split_transform_flag (log2TrafoSize=%d)\n", log2TrafoSize); int context = 5 - log2TrafoSize; assert(context >= 0 && context <= 2); logtrace(LogSlice, "# context: %d\n", context); int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG + context]); logtrace(LogSymbols, "$1 split_transform_flag=%d\n", bit); return bit; } static int decode_cbf_chroma(thread_context* tctx, int trafoDepth) { logtrace(LogSlice, "# cbf_chroma\n"); int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_CBF_CHROMA + trafoDepth]); logtrace(LogSymbols, "$1 cbf_chroma=%d\n", bit); return bit; } static int decode_cbf_luma(thread_context* tctx, int trafoDepth) { logtrace(LogSlice, "# cbf_luma\n"); int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_CBF_LUMA + (trafoDepth == 0)]); logtrace(LogSlice, "> cbf_luma = %d\n", bit); logtrace(LogSymbols, "$1 cbf_luma=%d\n", bit); return bit; } static inline int decode_coded_sub_block_flag(thread_context* tctx, int cIdx, uint8_t coded_sub_block_neighbors) { logtrace(LogSlice, "# coded_sub_block_flag\n"); // tricky computation of csbfCtx int csbfCtx = ((coded_sub_block_neighbors & 1) | // right neighbor set or (coded_sub_block_neighbors >> 1)); // bottom neighbor set -> csbfCtx=1 int ctxIdxInc = csbfCtx; if (cIdx != 0) { ctxIdxInc += 2; } int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG + ctxIdxInc]); logtrace(LogSymbols, "$1 coded_sub_block_flag=%d\n", bit); return bit; } 
static const uint8_t CABAC_QP_DELTA_ABS_ERROR = 0xFF; static uint8_t decode_cu_qp_delta_abs(thread_context* tctx) { logtrace(LogSlice, "# cu_qp_delta_abs\n"); int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_CU_QP_DELTA_ABS + 0]); if (bit == 0) { logtrace(LogSymbols, "$1 cu_qp_delta_abs=%d\n", 0); return 0; } uint8_t prefix = 1; for (uint8_t i = 0; i < 4; i++) { bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_CU_QP_DELTA_ABS + 1]); if (bit == 0) { break; } else { prefix++; } } if (prefix == 5) { uint32_t value = tctx->cabac_decoder.decode_EGk_bypass( 0); if (value >= 250) { return CABAC_QP_DELTA_ABS_ERROR; } logtrace(LogSymbols, "$1 cu_qp_delta_abs=%d\n", value + 5); return value + 5; } else { logtrace(LogSymbols, "$1 cu_qp_delta_abs=%d\n", prefix); return prefix; } } static int decode_last_significant_coeff_prefix(thread_context* tctx, int log2TrafoSize, int cIdx, context_model* model) { logtrace(LogSlice, "# last_significant_coeff_prefix log2TrafoSize:%d cIdx:%d\n", log2TrafoSize, cIdx); int cMax = (log2TrafoSize << 1) - 1; int ctxOffset, ctxShift; if (cIdx == 0) { ctxOffset = 3 * (log2TrafoSize - 2) + ((log2TrafoSize - 1) >> 2); ctxShift = (log2TrafoSize + 1) >> 2; } else { ctxOffset = 15; ctxShift = log2TrafoSize - 2; } int binIdx; int value = cMax; for (binIdx = 0; binIdx < cMax; binIdx++) { int ctxIdxInc = (binIdx >> ctxShift); logtrace(LogSlice, "context: %d+%d\n", ctxOffset, ctxIdxInc); int bit = tctx->cabac_decoder.decode_bit( &model[ctxOffset + ctxIdxInc]); if (bit == 0) { value = binIdx; break; } } logtrace(LogSlice, "> last_significant_coeff_prefix: %d\n", value); return value; } static const uint8_t ctxIdxMap[16] = { 0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 99 }; uint8_t* ctxIdxLookup[4 /* 4-log2-32 */][2 /* !!cIdx */][2 /* !!scanIdx */][4 /* prevCsbf */]; bool alloc_and_init_significant_coeff_ctxIdx_lookupTable() { int tableSize = 4 * 4 * (2) + 8 * 8 * (2 * 2 * 4) + 16 * 16 * (2 * 4) + 32 * 32 * (2 * 
4); uint8_t* p = (uint8_t*) malloc(tableSize); if (p == nullptr) { return false; } memset(p, 0xFF, tableSize); // just for debugging // --- Set pointers to memory areas. Note that some parameters share the same memory. --- // 4x4 for (int cIdx = 0; cIdx < 2; cIdx++) { for (int scanIdx = 0; scanIdx < 2; scanIdx++) for (int prevCsbf = 0; prevCsbf < 4; prevCsbf++) ctxIdxLookup[0][cIdx][scanIdx][prevCsbf] = p; p += 4 * 4; } // 8x8 for (int cIdx = 0; cIdx < 2; cIdx++) for (int scanIdx = 0; scanIdx < 2; scanIdx++) for (int prevCsbf = 0; prevCsbf < 4; prevCsbf++) { ctxIdxLookup[1][cIdx][scanIdx][prevCsbf] = p; p += 8 * 8; } // 16x16 for (int cIdx = 0; cIdx < 2; cIdx++) for (int prevCsbf = 0; prevCsbf < 4; prevCsbf++) { for (int scanIdx = 0; scanIdx < 2; scanIdx++) { ctxIdxLookup[2][cIdx][scanIdx][prevCsbf] = p; } p += 16 * 16; } // 32x32 for (int cIdx = 0; cIdx < 2; cIdx++) for (int prevCsbf = 0; prevCsbf < 4; prevCsbf++) { for (int scanIdx = 0; scanIdx < 2; scanIdx++) { ctxIdxLookup[3][cIdx][scanIdx][prevCsbf] = p; } p += 32 * 32; } // --- precompute ctxIdx tables --- for (int log2w = 2; log2w <= 5; log2w++) for (int cIdx = 0; cIdx < 2; cIdx++) for (int scanIdx = 0; scanIdx < 2; scanIdx++) for (int prevCsbf = 0; prevCsbf < 4; prevCsbf++) { for (int yC = 0; yC < (1 << log2w); yC++) for (int xC = 0; xC < (1 << log2w); xC++) { int w = 1 << log2w; int sbWidth = w >> 2; int sigCtx; // if log2TrafoSize==2 if (sbWidth == 1) { sigCtx = ctxIdxMap[(yC << 2) + xC]; } else if (xC + yC == 0) { sigCtx = 0; } else { int xS = xC >> 2; int yS = yC >> 2; /* int prevCsbf = 0; if (xS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+1 +yS*sbWidth]; } if (yS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+(1+yS)*sbWidth]<<1; } */ int xP = xC & 3; int yP = yC & 3; //logtrace(LogSlice,"posInSubset: %d,%d\n",xP,yP); //logtrace(LogSlice,"prevCsbf: %d\n",prevCsbf); switch (prevCsbf) { case 0: sigCtx = (xP + yP >= 3) ? 0 : (xP + yP > 0) ? 1 : 2; break; case 1: sigCtx = (yP == 0) ? 
2 : (yP == 1) ? 1 : 0; break; case 2: sigCtx = (xP == 0) ? 2 : (xP == 1) ? 1 : 0; break; default: sigCtx = 2; break; } //logtrace(LogSlice,"a) sigCtx=%d\n",sigCtx); if (cIdx == 0) { if (xS + yS > 0) sigCtx += 3; //logtrace(LogSlice,"b) sigCtx=%d\n",sigCtx); // if log2TrafoSize==3 if (sbWidth == 2) { // 8x8 block sigCtx += (scanIdx == 0) ? 9 : 15; } else { sigCtx += 21; } //logtrace(LogSlice,"c) sigCtx=%d\n",sigCtx); } else { // if log2TrafoSize==3 if (sbWidth == 2) { // 8x8 block sigCtx += 9; } else { sigCtx += 12; } } } int ctxIdxInc; if (cIdx == 0) { ctxIdxInc = sigCtx; } else { ctxIdxInc = 27 + sigCtx; } if (ctxIdxLookup[log2w - 2][cIdx][scanIdx][prevCsbf][xC + (yC << log2w)] != 0xFF) { assert(ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf][xC+(yC<> 4]; int x0 = S.x << 2; int y0 = S.y << 2; int subX = ScanOrderPos[s & 0xF].x; int subY = ScanOrderPos[s & 0xF].y; int xC = x0 + subX; int yC = y0 + subY; int w = 1 << log2w; int sbWidth = w >> 2; int sigCtx; // if log2TrafoSize==2 if (sbWidth == 1) { sigCtx = ctxIdxMap[(yC << 2) + xC]; } else if (xC + yC == 0) { sigCtx = 0; } else { int xS = xC >> 2; int yS = yC >> 2; /* int prevCsbf = 0; if (xS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+1 +yS*sbWidth]; } if (yS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+(1+yS)*sbWidth]<<1; } */ int xP = xC & 3; int yP = yC & 3; logtrace(LogSlice, "posInSubset: %d,%d\n", xP, yP); logtrace(LogSlice, "prevCsbf: %d\n", prevCsbf); //printf("%d | %d %d\n",prevCsbf,xP,yP); switch (prevCsbf) { case 0: //sigCtx = (xP+yP==0) ? 2 : (xP+yP<3) ? 1 : 0; sigCtx = (xP + yP >= 3) ? 0 : (xP + yP > 0) ? 1 : 2; break; case 1: sigCtx = (yP == 0) ? 2 : (yP == 1) ? 1 : 0; break; case 2: sigCtx = (xP == 0) ? 2 : (xP == 1) ? 1 : 0; break; default: sigCtx = 2; break; } logtrace(LogSlice, "a) sigCtx=%d\n", sigCtx); if (cIdx == 0) { if (xS + yS > 0) sigCtx += 3; logtrace(LogSlice, "b) sigCtx=%d\n", sigCtx); // if log2TrafoSize==3 if (sbWidth == 2) { // 8x8 block sigCtx += (scanIdx == 0) ? 
9 : 15; } else { sigCtx += 21; } logtrace(LogSlice, "c) sigCtx=%d\n", sigCtx); } else { // if log2TrafoSize==3 if (sbWidth == 2) { // 8x8 block sigCtx += 9; } else { sigCtx += 12; } } } int ctxIdxInc; if (cIdx == 0) { ctxIdxInc = sigCtx; } else { ctxIdxInc = 27 + sigCtx; } ctxIdxLookup[log2w - 2][cIdx][scanIdx][prevCsbf][xC + (yC << log2w)] = ctxIdxInc; //NOTE: when using this option, we have to include all three scanIdx in the table //ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf][s] = ctxIdxInc; } } return true; } void free_significant_coeff_ctxIdx_lookupTable() { free(ctxIdxLookup[0][0][0][0]); ctxIdxLookup[0][0][0][0] = nullptr; } #if 0 static int decode_significant_coeff_flag(thread_context* tctx, int xC, int yC, const uint8_t* coded_sub_block_flag, int sbWidth, int cIdx, int scanIdx) { logtrace(LogSlice, "# significant_coeff_flag (xC:%d yC:%d sbWidth:%d cIdx:%d scanIdx:%d)\n", xC, yC, sbWidth, cIdx, scanIdx); int sigCtx; // if log2TrafoSize==2 if (sbWidth == 1) { sigCtx = ctxIdxMap[(yC << 2) + xC]; } else if (xC + yC == 0) { sigCtx = 0; } else { int xS = xC >> 2; int yS = yC >> 2; int prevCsbf = 0; if (xS < sbWidth - 1) { prevCsbf += coded_sub_block_flag[xS + 1 + yS * sbWidth]; } if (yS < sbWidth - 1) { prevCsbf += coded_sub_block_flag[xS + (1 + yS) * sbWidth] << 1; } int xP = xC & 3; int yP = yC & 3; logtrace(LogSlice, "posInSubset: %d,%d\n", xP, yP); logtrace(LogSlice, "prevCsbf: %d\n", prevCsbf); //printf("%d | %d %d\n",prevCsbf,xP,yP); switch (prevCsbf) { case 0: //sigCtx = (xP+yP==0) ? 2 : (xP+yP<3) ? 1 : 0; sigCtx = (xP + yP >= 3) ? 0 : (xP + yP > 0) ? 1 : 2; break; case 1: sigCtx = (yP == 0) ? 2 : (yP == 1) ? 1 : 0; break; case 2: sigCtx = (xP == 0) ? 2 : (xP == 1) ? 1 : 0; break; default: sigCtx = 2; break; } logtrace(LogSlice, "a) sigCtx=%d\n", sigCtx); if (cIdx == 0) { if (xS + yS > 0) sigCtx += 3; logtrace(LogSlice, "b) sigCtx=%d\n", sigCtx); // if log2TrafoSize==3 if (sbWidth == 2) { sigCtx += (scanIdx == 0) ? 
9 : 15; } else { sigCtx += 21; } logtrace(LogSlice, "c) sigCtx=%d\n", sigCtx); } else { // if log2TrafoSize==3 if (sbWidth == 2) { sigCtx += 9; } else { sigCtx += 12; } } } int ctxIdxInc; if (cIdx == 0) { ctxIdxInc = sigCtx; } else { ctxIdxInc = 27 + sigCtx; } int context = tctx->shdr->initType * 42 + ctxIdxInc; logtrace(LogSlice, "context: %d\n", context); int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG + context]); return bit; } #endif static inline int decode_significant_coeff_flag_lookup(thread_context* tctx, uint8_t ctxIdxInc) { logtrace(LogSlice, "# significant_coeff_flag\n"); logtrace(LogSlice, "context: %d\n", ctxIdxInc); int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG + ctxIdxInc]); logtrace(LogSymbols, "$1 significant_coeff_flag=%d\n", bit); return bit; } static inline int decode_coeff_abs_level_greater1(thread_context* tctx, int cIdx, int i, bool firstCoeffInSubblock, bool firstSubblock, int lastSubblock_greater1Ctx, int* lastInvocation_greater1Ctx, int* lastInvocation_coeff_abs_level_greater1_flag, int* lastInvocation_ctxSet, int c1) { logtrace(LogSlice, "# coeff_abs_level_greater1\n"); logtrace(LogSlice, " cIdx:%d i:%d firstCoeffInSB:%d firstSB:%d lastSB>1:%d last>1Ctx:%d lastLev>1:%d lastCtxSet:%d\n", cIdx, i, firstCoeffInSubblock, firstSubblock, lastSubblock_greater1Ctx, *lastInvocation_greater1Ctx, *lastInvocation_coeff_abs_level_greater1_flag, *lastInvocation_ctxSet); int lastGreater1Ctx; int greater1Ctx; int ctxSet; logtrace(LogSlice, "c1: %d\n", c1); if (firstCoeffInSubblock) { // block with real DC -> ctx 0 if (i == 0 || cIdx > 0) { ctxSet = 0; } else { ctxSet = 2; } if (firstSubblock) { lastGreater1Ctx = 1; } else { lastGreater1Ctx = lastSubblock_greater1Ctx; } if (lastGreater1Ctx == 0) { ctxSet++; } logtrace(LogSlice, "ctxSet: %d\n", ctxSet); greater1Ctx = 1; } else { // !firstCoeffInSubblock ctxSet = *lastInvocation_ctxSet; logtrace(LogSlice, 
"ctxSet (old): %d\n", ctxSet); greater1Ctx = *lastInvocation_greater1Ctx; if (greater1Ctx > 0) { int lastGreater1Flag = *lastInvocation_coeff_abs_level_greater1_flag; if (lastGreater1Flag == 1) greater1Ctx = 0; else { /*if (greater1Ctx>0)*/ greater1Ctx++; } } } ctxSet = c1; // use HM algo int ctxIdxInc = (ctxSet * 4) + (greater1Ctx >= 3 ? 3 : greater1Ctx); if (cIdx > 0) { ctxIdxInc += 16; } int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG + ctxIdxInc]); *lastInvocation_greater1Ctx = greater1Ctx; *lastInvocation_coeff_abs_level_greater1_flag = bit; *lastInvocation_ctxSet = ctxSet; //logtrace(LogSymbols,"$1 coeff_abs_level_greater1=%d\n",bit); return bit; } static int decode_coeff_abs_level_greater2(thread_context* tctx, int cIdx, // int i,int n, int ctxSet) { logtrace(LogSlice, "# coeff_abs_level_greater2\n"); int ctxIdxInc = ctxSet; if (cIdx > 0) ctxIdxInc += 4; int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG + ctxIdxInc]); logtrace(LogSymbols, "$1 coeff_abs_level_greater2=%d\n", bit); return bit; } #define MAX_PREFIX (15+3) static int32_t decode_coeff_abs_level_remaining(thread_context* tctx, int cRiceParam) { logtrace(LogSlice, "# decode_coeff_abs_level_remaining\n"); uint16_t prefix = 0; while (tctx->cabac_decoder.decode_bypass()) { prefix++; if (prefix > MAX_PREFIX) { return 0; // TODO: error } } // prefix = nb. 1 bits int32_t value; if (prefix <= 3) { // when code only TR part (level < TRMax) int codeword = tctx->cabac_decoder.decode_FL_bypass( cRiceParam); value = (prefix << cRiceParam) + codeword; } else { // Suffix coded with EGk. Note that the unary part of EGk is already // included in the 'prefix' counter above. 
int codeword = tctx->cabac_decoder.decode_FL_bypass( prefix - 3 + cRiceParam); value = (((UINT16_C(1) << (prefix - 3)) + 3 - 1) << cRiceParam) + codeword; } logtrace(LogSymbols, "$1 coeff_abs_level_remaining=%d\n", value); return value; } static int decode_merge_flag(thread_context* tctx) { logtrace(LogSlice, "# merge_flag\n"); int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_MERGE_FLAG]); logtrace(LogSymbols, "$1 merge_flag=%d\n", bit); return bit; } static int decode_merge_idx(thread_context* tctx) { logtrace(LogSlice, "# merge_idx\n"); if (tctx->shdr->MaxNumMergeCand <= 1) { logtrace(LogSymbols, "$1 merge_idx=%d\n", 0); return 0; } // TU coding, first bin is CABAC, remaining are bypass. // cMax = MaxNumMergeCand-1 int idx = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_MERGE_IDX]); if (idx == 0) { // nothing } else { idx = 1; while (idx < tctx->shdr->MaxNumMergeCand - 1) { if (tctx->cabac_decoder.decode_bypass()) { idx++; } else { break; } } } logtrace(LogSlice, "> merge_idx = %d\n", idx); logtrace(LogSymbols, "$1 merge_idx=%d\n", idx); return idx; } static int decode_pred_mode_flag(thread_context* tctx) { logtrace(LogSlice, "# pred_mode_flag\n"); int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_PRED_MODE_FLAG]); logtrace(LogSymbols, "$1 pred_mode=%d\n", bit); return bit; } static int decode_mvp_lx_flag(thread_context* tctx) { logtrace(LogSlice, "# mvp_lx_flag\n"); int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_MVP_LX_FLAG]); logtrace(LogSymbols, "$1 mvp_lx_flag=%d\n", bit); return bit; } static int decode_rqt_root_cbf(thread_context* tctx) { logtrace(LogSlice, "# rqt_root_cbf\n"); int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_RQT_ROOT_CBF]); logtrace(LogSymbols, "$1 rqt_root_cbf=%d\n", bit); return bit; } static int decode_ref_idx_lX(thread_context* tctx, int numRefIdxLXActive) { // prevent endless loop when 'numRefIdxLXActive' is invalid if 
(numRefIdxLXActive <= 1) { return 0; } logtrace(LogSlice, "# ref_idx_lX\n"); int cMax = numRefIdxLXActive - 1; if (cMax == 0) { logtrace(LogSlice, "> ref_idx = 0 (cMax==0)\n"); return 0; } // do check for single reference frame here int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_REF_IDX_LX + 0]); int idx = 0; while (bit) { idx++; if (idx == cMax) { break; } if (idx == 1) { bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_REF_IDX_LX + 1]); } else { bit = tctx->cabac_decoder.decode_bypass(); } } logtrace(LogSlice, "> ref_idx = %d\n", idx); logtrace(LogSymbols, "$1 ref_idx_lX=%d\n", idx); return idx; } static enum InterPredIdc decode_inter_pred_idc(thread_context* tctx, int x0, int y0, int nPbW, int nPbH, int ctDepth) { logtrace(LogSlice, "# inter_pred_idc\n"); int value; context_model* model = &tctx->ctx_model[CONTEXT_MODEL_INTER_PRED_IDC]; if (nPbW + nPbH == 12) { value = tctx->cabac_decoder.decode_bit( &model[4]); } else { int bit0 = tctx->cabac_decoder.decode_bit( &model[ctDepth]); if (bit0 == 0) { value = tctx->cabac_decoder.decode_bit( &model[4]); } else { value = 2; } } logtrace(LogSlice, "> inter_pred_idc = %d (%s)\n", value, value == 0 ? "L0" : (value == 1 ? "L1" : "BI")); logtrace(LogSymbols, "$1 decode_inter_pred_idx=%d\n", value + 1); return (enum InterPredIdc) (value + 1); } static int decode_explicit_rdpcm_flag(thread_context* tctx, int cIdx) { context_model* model = &tctx->ctx_model[CONTEXT_MODEL_RDPCM_FLAG]; int value = tctx->cabac_decoder.decode_bit( &model[cIdx ? 1 : 0]); return value; } static int decode_explicit_rdpcm_dir(thread_context* tctx, int cIdx) { context_model* model = &tctx->ctx_model[CONTEXT_MODEL_RDPCM_DIR]; int value = tctx->cabac_decoder.decode_bit( &model[cIdx ? 
1 : 0]); return value; } /* Take CtbAddrInTS and compute -> CtbAddrInRS, CtbX, CtbY */ bool setCtbAddrFromTS(thread_context* tctx) { const seq_parameter_set& sps = tctx->img->get_sps(); if (tctx->CtbAddrInTS < sps.PicSizeInCtbsY) { tctx->CtbAddrInRS = tctx->img->get_pps().CtbAddrTStoRS[tctx->CtbAddrInTS]; tctx->CtbX = tctx->CtbAddrInRS % sps.PicWidthInCtbsY; tctx->CtbY = tctx->CtbAddrInRS / sps.PicWidthInCtbsY; return false; } else { tctx->CtbAddrInRS = sps.PicSizeInCtbsY; tctx->CtbX = tctx->CtbAddrInRS % sps.PicWidthInCtbsY; tctx->CtbY = tctx->CtbAddrInRS / sps.PicWidthInCtbsY; return true; } } // returns true when we reached the end of the image (ctbAddr==picSizeInCtbsY) bool advanceCtbAddr(thread_context* tctx) { tctx->CtbAddrInTS++; return setCtbAddrFromTS(tctx); } void read_sao(thread_context* tctx, int xCtb, int yCtb, int CtbAddrInSliceSeg) { slice_segment_header* shdr = tctx->shdr; de265_image* img = tctx->img; const seq_parameter_set& sps = img->get_sps(); const pic_parameter_set& pps = img->get_pps(); logtrace(LogSlice, "# read_sao(%d,%d)\n", xCtb, yCtb); sao_info saoinfo; memset(&saoinfo, 0, sizeof(sao_info)); logtrace(LogSlice, "sizeof saoinfo: %d\n", sizeof(sao_info)); char sao_merge_left_flag = 0; char sao_merge_up_flag = 0; if (xCtb > 0) { //char leftCtbInSliceSeg = (CtbAddrInSliceSeg>0); char leftCtbInSliceSeg = (tctx->CtbAddrInRS > shdr->SliceAddrRS); char leftCtbInTile = (pps.TileIdRS[xCtb + yCtb * sps.PicWidthInCtbsY] == pps.TileIdRS[xCtb - 1 + yCtb * sps.PicWidthInCtbsY]); if (leftCtbInSliceSeg && leftCtbInTile) { sao_merge_left_flag = decode_sao_merge_flag(tctx); logtrace(LogSlice, "sao_merge_left_flag: %d\n", sao_merge_left_flag); } } if (yCtb > 0 && sao_merge_left_flag == 0) { logtrace(LogSlice, "CtbAddrInRS:%d PicWidthInCtbsY:%d slice_segment_address:%d\n", tctx->CtbAddrInRS, sps.PicWidthInCtbsY, shdr->slice_segment_address); bool upCtbInSliceSeg = (tctx->CtbAddrInRS - sps.PicWidthInCtbsY) >= shdr->SliceAddrRS; bool upCtbInTile = 
(pps.TileIdRS[xCtb + yCtb * sps.PicWidthInCtbsY] == pps.TileIdRS[xCtb + (yCtb - 1) * sps.PicWidthInCtbsY]); if (upCtbInSliceSeg && upCtbInTile) { sao_merge_up_flag = decode_sao_merge_flag(tctx); logtrace(LogSlice, "sao_merge_up_flag: %d\n", sao_merge_up_flag); } } if (!sao_merge_up_flag && !sao_merge_left_flag) { int nChroma = 3; if (sps.ChromaArrayType == CHROMA_MONO) nChroma = 1; for (int cIdx = 0; cIdx < nChroma; cIdx++) { if ((shdr->slice_sao_luma_flag && cIdx == 0) || (shdr->slice_sao_chroma_flag && cIdx > 0)) { uint8_t SaoTypeIdx = 0; if (cIdx == 0) { uint8_t sao_type_idx_luma = decode_sao_type_idx(tctx); logtrace(LogSlice, "sao_type_idx_luma: %d\n", sao_type_idx_luma); saoinfo.SaoTypeIdx = SaoTypeIdx = sao_type_idx_luma; } else if (cIdx == 1) { uint8_t sao_type_idx_chroma = decode_sao_type_idx(tctx); logtrace(LogSlice, "sao_type_idx_chroma: %d\n", sao_type_idx_chroma); SaoTypeIdx = sao_type_idx_chroma; saoinfo.SaoTypeIdx |= SaoTypeIdx << (2 * 1); saoinfo.SaoTypeIdx |= SaoTypeIdx << (2 * 2); // set for both chroma components } else { // SaoTypeIdx = 0 SaoTypeIdx = (saoinfo.SaoTypeIdx >> (2 * cIdx)) & 0x3; } if (SaoTypeIdx != 0) { for (int i = 0; i < 4; i++) { saoinfo.saoOffsetVal[cIdx][i] = decode_sao_offset_abs(tctx, img->get_bit_depth(cIdx)); logtrace(LogSlice, "saoOffsetVal[%d][%d] = %d\n", cIdx, i, saoinfo.saoOffsetVal[cIdx][i]); } int sign[4]; if (SaoTypeIdx == 1) { for (int i = 0; i < 4; i++) { if (saoinfo.saoOffsetVal[cIdx][i] != 0) { sign[i] = decode_sao_offset_sign(tctx) ? 
-1 : 1; } else { sign[i] = 0; // not really required, but compiler warns about uninitialized values } } saoinfo.sao_band_position[cIdx] = decode_sao_band_position(tctx); } else { uint8_t SaoEoClass = 0; sign[0] = sign[1] = 1; sign[2] = sign[3] = -1; if (cIdx == 0) { saoinfo.SaoEoClass = SaoEoClass = decode_sao_class(tctx); } else if (cIdx == 1) { SaoEoClass = decode_sao_class(tctx); saoinfo.SaoEoClass |= SaoEoClass << (2 * 1); saoinfo.SaoEoClass |= SaoEoClass << (2 * 2); } logtrace(LogSlice, "SaoEoClass[%d] = %d\n", cIdx, SaoEoClass); } int log2OffsetScale; if (cIdx == 0) { log2OffsetScale = pps.range_extension.log2_sao_offset_scale_luma; } else { log2OffsetScale = pps.range_extension.log2_sao_offset_scale_chroma; } for (int i = 0; i < 4; i++) { saoinfo.saoOffsetVal[cIdx][i] = sign[i] * (saoinfo.saoOffsetVal[cIdx][i] << log2OffsetScale); } } } } img->set_sao_info(xCtb, yCtb, &saoinfo); } if (sao_merge_left_flag) { img->set_sao_info(xCtb, yCtb, img->get_sao_info(xCtb - 1, yCtb)); } if (sao_merge_up_flag) { img->set_sao_info(xCtb, yCtb, img->get_sao_info(xCtb, yCtb - 1)); } } void read_coding_tree_unit(thread_context* tctx) { slice_segment_header* shdr = tctx->shdr; de265_image* img = tctx->img; const seq_parameter_set& sps = img->get_sps(); int xCtb = (tctx->CtbAddrInRS % sps.PicWidthInCtbsY); int yCtb = (tctx->CtbAddrInRS / sps.PicWidthInCtbsY); int xCtbPixels = xCtb << sps.Log2CtbSizeY; int yCtbPixels = yCtb << sps.Log2CtbSizeY; logtrace(LogSlice, "----- decode CTB %d;%d (%d;%d) POC=%d, SliceAddrRS=%d\n", xCtbPixels, yCtbPixels, xCtb, yCtb, tctx->img->PicOrderCntVal, tctx->shdr->SliceAddrRS); img->set_SliceAddrRS(xCtb, yCtb, tctx->shdr->SliceAddrRS); img->set_SliceHeaderIndex(xCtbPixels, yCtbPixels, shdr->slice_index); int CtbAddrInSliceSeg = tctx->CtbAddrInRS - shdr->slice_segment_address; if (shdr->slice_sao_luma_flag || shdr->slice_sao_chroma_flag) { read_sao(tctx, xCtb, yCtb, CtbAddrInSliceSeg); } read_coding_quadtree(tctx, xCtbPixels, yCtbPixels, 
sps.Log2CtbSizeY, 0); } LIBDE265_INLINE static int luma_pos_to_ctbAddrRS(const seq_parameter_set* sps, int x, int y) { int ctbX = x >> sps->Log2CtbSizeY; int ctbY = y >> sps->Log2CtbSizeY; return ctbY * sps->PicWidthInCtbsY + ctbX; } int check_CTB_available(const de265_image* img, int xC, int yC, int xN, int yN) { // check whether neighbor is outside of frame if (xN < 0 || yN < 0) { return 0; } if (xN >= img->get_sps().pic_width_in_luma_samples) { return 0; } if (yN >= img->get_sps().pic_height_in_luma_samples) { return 0; } int current_ctbAddrRS = luma_pos_to_ctbAddrRS(&img->get_sps(), xC, yC); int neighbor_ctbAddrRS = luma_pos_to_ctbAddrRS(&img->get_sps(), xN, yN); // TODO: check if this is correct (6.4.1) if (img->get_SliceAddrRS_atCtbRS(current_ctbAddrRS) != img->get_SliceAddrRS_atCtbRS(neighbor_ctbAddrRS)) { return 0; } // check if both CTBs are in the same tile. if (img->get_pps().TileIdRS[current_ctbAddrRS] != img->get_pps().TileIdRS[neighbor_ctbAddrRS]) { return 0; } return 1; } int residual_coding(thread_context* tctx, int x0, int y0, // position of TU in frame int log2TrafoSize, int cIdx) { logtrace(LogSlice, "- residual_coding x0:%d y0:%d log2TrafoSize:%d cIdx:%d\n", x0, y0, log2TrafoSize, cIdx); //slice_segment_header* shdr = tctx->shdr; de265_image* img = tctx->img; const seq_parameter_set& sps = img->get_sps(); const pic_parameter_set& pps = img->get_pps(); enum PredMode PredMode = img->get_pred_mode(x0, y0); if (cIdx == 0) { img->set_nonzero_coefficient(x0, y0, log2TrafoSize); } if (pps.transform_skip_enabled_flag && !tctx->cu_transquant_bypass_flag && (log2TrafoSize <= pps.Log2MaxTransformSkipSize)) { tctx->transform_skip_flag[cIdx] = decode_transform_skip_flag(tctx, cIdx); } else { tctx->transform_skip_flag[cIdx] = 0; } tctx->explicit_rdpcm_flag = false; if (PredMode == MODE_INTER && sps.range_extension.explicit_rdpcm_enabled_flag && (tctx->transform_skip_flag[cIdx] || tctx->cu_transquant_bypass_flag)) { tctx->explicit_rdpcm_flag = 
decode_explicit_rdpcm_flag(tctx, cIdx); if (tctx->explicit_rdpcm_flag) { tctx->explicit_rdpcm_dir = decode_explicit_rdpcm_dir(tctx, cIdx); } //printf("EXPLICIT RDPCM %d;%d\n",x0,y0); } else { tctx->explicit_rdpcm_flag = false; } // sbType for persistent_rice_adaptation_enabled_flag int sbType = (cIdx == 0) ? 2 : 0; if (tctx->transform_skip_flag[cIdx] || tctx->cu_transquant_bypass_flag) { sbType++; } // --- decode position of last coded coefficient --- int last_significant_coeff_x_prefix = decode_last_significant_coeff_prefix(tctx, log2TrafoSize, cIdx, &tctx->ctx_model[CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX]); int last_significant_coeff_y_prefix = decode_last_significant_coeff_prefix(tctx, log2TrafoSize, cIdx, &tctx->ctx_model[CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX]); // TODO: we can combine both FL-bypass calls into one, but the gain may be limited... int LastSignificantCoeffX; if (last_significant_coeff_x_prefix > 3) { int nBits = (last_significant_coeff_x_prefix >> 1) - 1; int last_significant_coeff_x_suffix = tctx->cabac_decoder.decode_FL_bypass( nBits); LastSignificantCoeffX = ((2 + (last_significant_coeff_x_prefix & 1)) << nBits) + last_significant_coeff_x_suffix; } else { LastSignificantCoeffX = last_significant_coeff_x_prefix; } int LastSignificantCoeffY; if (last_significant_coeff_y_prefix > 3) { int nBits = (last_significant_coeff_y_prefix >> 1) - 1; int last_significant_coeff_y_suffix = tctx->cabac_decoder.decode_FL_bypass( nBits); LastSignificantCoeffY = ((2 + (last_significant_coeff_y_prefix & 1)) << nBits) + last_significant_coeff_y_suffix; } else { LastSignificantCoeffY = last_significant_coeff_y_prefix; } // --- determine scanIdx --- int scanIdx; if (PredMode == MODE_INTRA) { if (cIdx == 0) { scanIdx = get_intra_scan_idx(log2TrafoSize, img->get_IntraPredMode(x0, y0), cIdx, &sps); //printf("luma scan idx=%d <- intra mode=%d\n",scanIdx, img->get_IntraPredMode(x0,y0)); } else { scanIdx = get_intra_scan_idx(log2TrafoSize, 
img->get_IntraPredModeC(x0, y0), cIdx, &sps); //printf("chroma scan idx=%d <- intra mode=%d chroma:%d trsize:%d\n",scanIdx, // img->get_IntraPredModeC(x0,y0), sps->chroma_format_idc, 1<nCoeff[cIdx] = 0; // i - subblock index // n - coefficient index in subblock for (int i = lastSubBlock; i >= 0; i--) { position S = ScanOrderSub[i]; int inferSbDcSigCoeffFlag = 0; logtrace(LogSlice, "sub block scan idx: %d\n", i); // --- check whether this sub-block is coded --- int sub_block_is_coded = 0; if ((i < lastSubBlock) && (i > 0)) { sub_block_is_coded = decode_coded_sub_block_flag(tctx, cIdx, coded_sub_block_neighbors[S.x + S.y * sbWidth]); inferSbDcSigCoeffFlag = 1; } else if (i == 0 || i == lastSubBlock) { // first (DC) and last sub-block are always coded // - the first will most probably contain coefficients // - the last obviously contains the last coded coefficient sub_block_is_coded = 1; } if (sub_block_is_coded) { if (S.x > 0) coded_sub_block_neighbors[S.x - 1 + S.y * sbWidth] |= 1; if (S.y > 0) coded_sub_block_neighbors[S.x + (S.y - 1) * sbWidth] |= 2; } // ----- find significant coefficients in this sub-block ----- int16_t coeff_value[16]; int8_t coeff_scan_pos[16]; int8_t coeff_sign[16]; int8_t coeff_has_max_base_level[16]; int nCoefficients = 0; if (sub_block_is_coded) { int x0 = S.x << 2; int y0 = S.y << 2; int log2w = log2TrafoSize - 2; int prevCsbf = coded_sub_block_neighbors[S.x + S.y * sbWidth]; uint8_t* ctxIdxMap = ctxIdxLookup[log2w][!!cIdx][!!scanIdx][prevCsbf]; logdebug(LogSlice, "log2w:%d cIdx:%d scanIdx:%d prevCsbf:%d\n", log2w, cIdx, scanIdx, prevCsbf); // set the last coded coefficient in the last subblock int last_coeff = (i == lastSubBlock) ? 
lastScanPos - 1 : 15; if (i == lastSubBlock) { coeff_value[nCoefficients] = 1; coeff_has_max_base_level[nCoefficients] = 1; coeff_scan_pos[nCoefficients] = lastScanPos; nCoefficients++; } // --- decode all coefficients' significant_coeff flags except for the DC coefficient --- for (int n = last_coeff; n > 0; n--) { int subX = ScanOrderPos[n].x; int subY = ScanOrderPos[n].y; xC = x0 + subX; yC = y0 + subY; // for all AC coefficients in sub-block, a significant_coeff flag is coded int ctxInc; if (sps.range_extension.transform_skip_context_enabled_flag && (tctx->cu_transquant_bypass_flag || tctx->transform_skip_flag[cIdx])) { ctxInc = (cIdx == 0) ? 42 : (16 + 27); } else { ctxInc = ctxIdxMap[xC + (yC << log2TrafoSize)]; } logtrace(LogSlice, "trafoSize: %d\n", 1 << log2TrafoSize); int significant_coeff = decode_significant_coeff_flag_lookup(tctx, ctxInc); if (significant_coeff) { coeff_value[nCoefficients] = 1; coeff_has_max_base_level[nCoefficients] = 1; coeff_scan_pos[nCoefficients] = n; nCoefficients++; // since we have a coefficient in the sub-block, // we cannot infer the DC coefficient anymore inferSbDcSigCoeffFlag = 0; } } // --- decode DC coefficient significance --- if (last_coeff >= 0) // last coded coefficient (always set to 1) is not the DC coefficient { if (inferSbDcSigCoeffFlag == 0) { // if we cannot infert the DC coefficient, it is coded int ctxInc; if (sps.range_extension.transform_skip_context_enabled_flag && (tctx->cu_transquant_bypass_flag || tctx->transform_skip_flag[cIdx])) { ctxInc = (cIdx == 0) ? 
42 : (16 + 27); } else { ctxInc = ctxIdxMap[x0 + (y0 << log2TrafoSize)]; } int significant_coeff = decode_significant_coeff_flag_lookup(tctx, ctxInc); if (significant_coeff) { coeff_value[nCoefficients] = 1; coeff_has_max_base_level[nCoefficients] = 1; coeff_scan_pos[nCoefficients] = 0; nCoefficients++; } } else { // we can infer that the DC coefficient must be present coeff_value[nCoefficients] = 1; coeff_has_max_base_level[nCoefficients] = 1; coeff_scan_pos[nCoefficients] = 0; nCoefficients++; } } } /* logtrace(LogSlice,"significant_coeff_flags:\n"); for (int y=0;y<4;y++) { logtrace(LogSlice," "); for (int x=0;x<4;x++) { logtrace(LogSlice,"*%d ",significant_coeff_flag[y][x]); } logtrace(LogSlice,"*\n"); } */ if (nCoefficients) { int ctxSet; if (i == 0 || cIdx > 0) { ctxSet = 0; } else { ctxSet = 2; } if (c1 == 0) { ctxSet++; } c1 = 1; // --- decode greater-1 flags --- int newLastGreater1ScanPos = -1; int lastGreater1Coefficient = libde265_min(8, nCoefficients); for (int c = 0; c < lastGreater1Coefficient; c++) { int greater1_flag = decode_coeff_abs_level_greater1(tctx, cIdx, i, c == 0, firstSubblock, lastSubblock_greater1Ctx, &lastInvocation_greater1Ctx, &lastInvocation_coeff_abs_level_greater1_flag, &lastInvocation_ctxSet, ctxSet); if (greater1_flag) { coeff_value[c]++; c1 = 0; if (newLastGreater1ScanPos == -1) { newLastGreater1ScanPos = c; } } else { coeff_has_max_base_level[c] = 0; if (c1 < 3 && c1 > 0) { c1++; } } } firstSubblock = false; lastSubblock_greater1Ctx = lastInvocation_greater1Ctx; // --- decode greater-2 flag --- if (newLastGreater1ScanPos != -1) { int flag = decode_coeff_abs_level_greater2(tctx, cIdx, lastInvocation_ctxSet); coeff_value[newLastGreater1ScanPos] += flag; coeff_has_max_base_level[newLastGreater1ScanPos] = flag; } // --- decode coefficient signs --- int signHidden; IntraPredMode predModeIntra; if (cIdx == 0) predModeIntra = img->get_IntraPredMode(x0, y0); else predModeIntra = img->get_IntraPredModeC(x0, y0); if 
(tctx->cu_transquant_bypass_flag || (PredMode == MODE_INTRA && sps.range_extension.implicit_rdpcm_enabled_flag && tctx->transform_skip_flag[cIdx] && (predModeIntra == 10 || predModeIntra == 26)) || tctx->explicit_rdpcm_flag) { signHidden = 0; } else { signHidden = (coeff_scan_pos[0] - coeff_scan_pos[nCoefficients - 1] > 3); } for (int n = 0; n < nCoefficients - 1; n++) { coeff_sign[n] = tctx->cabac_decoder.decode_bypass(); logtrace(LogSlice, "sign[%d] = %d\n", n, coeff_sign[n]); } // n==nCoefficients-1 if (!pps.sign_data_hiding_flag || !signHidden) { coeff_sign[nCoefficients - 1] = tctx->cabac_decoder.decode_bypass(); logtrace(LogSlice, "sign[%d] = %d\n", nCoefficients - 1, coeff_sign[nCoefficients - 1]); } else { coeff_sign[nCoefficients - 1] = 0; } // --- decode coefficient value --- int sumAbsLevel = 0; int uiGoRiceParam; if (sps.range_extension.persistent_rice_adaptation_enabled_flag == 0) { uiGoRiceParam = 0; } else { uiGoRiceParam = tctx->StatCoeff[sbType] / 4; } // printf("initial uiGoRiceParam=%d\n",uiGoRiceParam); bool firstCoeffWithAbsLevelRemaining = true; for (int n = 0; n < nCoefficients; n++) { int16_t baseLevel = coeff_value[n]; int32_t coeff_abs_level_remaining; // printf("coeff %d/%d, uiRiceParam: %d\n",n,nCoefficients,uiGoRiceParam); if (coeff_has_max_base_level[n]) { coeff_abs_level_remaining = decode_coeff_abs_level_remaining(tctx, uiGoRiceParam); if (sps.range_extension.persistent_rice_adaptation_enabled_flag == 0) { // (2014.10 / 9-20) if (baseLevel + coeff_abs_level_remaining > 3 * (1 << uiGoRiceParam)) { uiGoRiceParam++; if (uiGoRiceParam > 4) uiGoRiceParam = 4; } } else { if (baseLevel + coeff_abs_level_remaining > 3 * (1 << uiGoRiceParam)) uiGoRiceParam++; } // persistent_rice_adaptation_enabled_flag if (sps.range_extension.persistent_rice_adaptation_enabled_flag && firstCoeffWithAbsLevelRemaining) { if (coeff_abs_level_remaining >= (3 << (tctx->StatCoeff[sbType] / 4))) { tctx->StatCoeff[sbType]++; } else if (2 * coeff_abs_level_remaining 
< (1 << (tctx->StatCoeff[sbType] / 4)) && tctx->StatCoeff[sbType] > 0) { tctx->StatCoeff[sbType]--; } } firstCoeffWithAbsLevelRemaining = false; } else { coeff_abs_level_remaining = 0; } logtrace(LogSlice, "coeff_abs_level_remaining=%d\n", coeff_abs_level_remaining); int32_t currCoeff = baseLevel + coeff_abs_level_remaining; if (coeff_sign[n]) { currCoeff = -currCoeff; } if (pps.sign_data_hiding_flag && signHidden) { sumAbsLevel += currCoeff; if (n == nCoefficients - 1 && (sumAbsLevel & 1)) { currCoeff = -currCoeff; } } logtrace(LogSlice, "quantized coefficient=%d\n", currCoeff); #ifdef DE265_LOG_TRACE //TransCoeffLevel[yC*CoeffStride + xC] = currCoeff; #endif // put coefficient in list int p = coeff_scan_pos[n]; xC = (S.x << 2) + ScanOrderPos[p].x; yC = (S.y << 2) + ScanOrderPos[p].y; tctx->coeffList[cIdx][tctx->nCoeff[cIdx]] = Clip3(-32768, 32767, currCoeff); tctx->coeffPos[cIdx][tctx->nCoeff[cIdx]] = xC + yC * CoeffStride; tctx->nCoeff[cIdx]++; //printf("%d ",currCoeff); } // iterate through coefficients in sub-block //printf(" (%d;%d)\n",x0,y0); } // if nonZero } // next sub-block return DE265_OK; } static void decode_TU(thread_context* tctx, int x0, int y0, int xCUBase, int yCUBase, int nT, int cIdx, enum PredMode cuPredMode, bool cbf) { de265_image* img = tctx->img; const seq_parameter_set& sps = img->get_sps(); int residualDpcm = 0; if (cuPredMode == MODE_INTRA) // if intra mode { enum IntraPredMode intraPredMode; if (cIdx == 0) { intraPredMode = img->get_IntraPredMode(x0, y0); } else { const int SubWidthC = sps.SubWidthC; const int SubHeightC = sps.SubHeightC; intraPredMode = img->get_IntraPredModeC(x0 * SubWidthC, y0 * SubHeightC); } if (intraPredMode < 0 || intraPredMode >= 35) { // TODO: ERROR intraPredMode = INTRA_DC; } decode_intra_prediction(img, x0, y0, intraPredMode, nT, cIdx); residualDpcm = sps.range_extension.implicit_rdpcm_enabled_flag && (tctx->cu_transquant_bypass_flag || tctx->transform_skip_flag[cIdx]) && (intraPredMode == 10 || 
intraPredMode == 26); if (residualDpcm && intraPredMode == 26) residualDpcm = 2; } else // INTER { if (tctx->explicit_rdpcm_flag) { residualDpcm = (tctx->explicit_rdpcm_dir ? 2 : 1); } } if (cbf) { scale_coefficients(tctx, x0, y0, xCUBase, yCUBase, nT, cIdx, tctx->transform_skip_flag[cIdx], cuPredMode == MODE_INTRA, residualDpcm); } /* else if (!cbf && cIdx==0) { memset(tctx->residual_luma,0,32*32*sizeof(int32_t)); } */ else if (!cbf && cIdx != 0 && tctx->ResScaleVal) { // --- cross-component-prediction when CBF==0 --- tctx->nCoeff[cIdx] = 0; residualDpcm = 0; scale_coefficients(tctx, x0, y0, xCUBase, yCUBase, nT, cIdx, tctx->transform_skip_flag[cIdx], cuPredMode == MODE_INTRA, residualDpcm); } } static int decode_log2_res_scale_abs_plus1(thread_context* tctx, int cIdxMinus1) { //const int context = (cIdx==0) ? 0 : 1; logtrace(LogSlice, "# log2_res_scale_abs_plus1 (c=%d)\n", cIdxMinus1); int value = 0; int cMax = 4; for (int binIdx = 0; binIdx < cMax; binIdx++) { int ctxIdxInc = 4 * cIdxMinus1 + binIdx; int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_LOG2_RES_SCALE_ABS_PLUS1 + ctxIdxInc]); if (!bit) break; value++; } logtrace(LogSymbols, "$1 log2_res_scale_abs_plus1=%d\n", value); return value; } static int decode_res_scale_sign_flag(thread_context* tctx, int cIdxMinus1) { //const int context = (cIdx==0) ? 
0 : 1; logtrace(LogSlice, "# res_scale_sign_flag (c=%d)\n", cIdxMinus1); int bit = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_RES_SCALE_SIGN_FLAG + cIdxMinus1]); logtrace(LogSymbols, "$1 res_scale_sign_flag=%d\n", bit); return bit; } static void read_cross_comp_pred(thread_context* tctx, int cIdxMinus1) { int log2_res_scale_abs_plus1 = decode_log2_res_scale_abs_plus1(tctx, cIdxMinus1); int ResScaleVal; if (log2_res_scale_abs_plus1 != 0) { int res_scale_sign_flag = decode_res_scale_sign_flag(tctx, cIdxMinus1); ResScaleVal = 1 << (log2_res_scale_abs_plus1 - 1); ResScaleVal *= 1 - 2 * res_scale_sign_flag; } else { ResScaleVal = 0; } tctx->ResScaleVal = ResScaleVal; } int read_transform_unit(thread_context* tctx, int x0, int y0, // position of TU in frame int xBase, int yBase, // position of parent TU in frame int xCUBase, int yCUBase, // position of CU in frame int log2TrafoSize, int trafoDepth, int blkIdx, int cbf_luma, int cbf_cb, int cbf_cr) { logtrace(LogSlice, "- read_transform_unit x0:%d y0:%d xBase:%d yBase:%d nT:%d cbf:%d:%d:%d\n", x0, y0, xBase, yBase, 1 << log2TrafoSize, cbf_luma, cbf_cb, cbf_cr); assert(cbf_cb != -1); assert(cbf_cr != -1); assert(cbf_luma != -1); const seq_parameter_set& sps = tctx->img->get_sps(); const int ChromaArrayType = sps.ChromaArrayType; int log2TrafoSizeC = (ChromaArrayType == CHROMA_444 ? 
log2TrafoSize : log2TrafoSize - 1); log2TrafoSizeC = libde265_max(2, log2TrafoSizeC); const int cbfLuma = cbf_luma; const int cbfChroma = cbf_cb | cbf_cr; tctx->transform_skip_flag[0] = 0; tctx->transform_skip_flag[1] = 0; tctx->transform_skip_flag[2] = 0; tctx->explicit_rdpcm_flag = false; enum PredMode cuPredMode = tctx->img->get_pred_mode(x0, y0); if (cbfLuma || cbfChroma) { bool doDecodeQuantParameters = false; if (tctx->img->get_pps().cu_qp_delta_enabled_flag && !tctx->IsCuQpDeltaCoded) { uint8_t cu_qp_delta_abs = decode_cu_qp_delta_abs(tctx); if (cu_qp_delta_abs == CABAC_QP_DELTA_ABS_ERROR) { tctx->decctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } int cu_qp_delta_sign = 0; if (cu_qp_delta_abs) { cu_qp_delta_sign = tctx->cabac_decoder.decode_bypass(); } // CuQpDeltaVal shall be in [-(26 + QpBdOffsetY/2), 25 + QpBdOffsetY/2] (Sec. 7.4.9.10) int maxCuQpDeltaAbs = 25 + tctx->img->get_sps().QpBdOffset_Y / 2; if (cu_qp_delta_abs > maxCuQpDeltaAbs) { tctx->decctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } tctx->IsCuQpDeltaCoded = 1; tctx->CuQpDelta = cu_qp_delta_abs * (1 - 2 * cu_qp_delta_sign); //printf("read cu_qp_delta (%d;%d) = %d\n",x0,y0,tctx->CuQpDelta); logtrace(LogSlice, "cu_qp_delta_abs = %d\n", cu_qp_delta_abs); logtrace(LogSlice, "cu_qp_delta_sign = %d\n", cu_qp_delta_sign); logtrace(LogSlice, "CuQpDelta = %d\n", tctx->CuQpDelta); doDecodeQuantParameters = true; //decode_quantization_parameters(tctx, x0,y0, xCUBase, yCUBase); } if (tctx->shdr->cu_chroma_qp_offset_enabled_flag && cbfChroma && !tctx->cu_transquant_bypass_flag && !tctx->IsCuChromaQpOffsetCoded) { logtrace(LogSlice, "# cu_chroma_qp_offset_flag\n"); int cu_chroma_qp_offset_flag = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_FLAG]); const pic_parameter_set& pps = tctx->img->get_pps(); int cu_chroma_qp_offset_idx = 0; 
if (cu_chroma_qp_offset_flag && pps.range_extension.chroma_qp_offset_list_len > 1) { cu_chroma_qp_offset_idx = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_IDX]); } tctx->IsCuChromaQpOffsetCoded = 1; if (cu_chroma_qp_offset_flag) { tctx->CuQpOffsetCb = pps.range_extension.cb_qp_offset_list[cu_chroma_qp_offset_idx]; tctx->CuQpOffsetCr = pps.range_extension.cr_qp_offset_list[cu_chroma_qp_offset_idx]; } else { tctx->CuQpOffsetCb = 0; tctx->CuQpOffsetCr = 0; } doDecodeQuantParameters = true; //decode_quantization_parameters(tctx, x0,y0, xCUBase, yCUBase); } if (doDecodeQuantParameters) { decode_quantization_parameters(tctx, x0, y0, xCUBase, yCUBase); } } // position of TU in local CU //int xL = x0 - xCUBase; //int yL = y0 - yCUBase; int nT = 1 << log2TrafoSize; int nTC = 1 << log2TrafoSizeC; const int SubWidthC = sps.SubWidthC; const int SubHeightC = sps.SubHeightC; // --- luma --- tctx->ResScaleVal = 0; int err; if (cbf_luma) { if ((err = residual_coding(tctx, x0, y0, log2TrafoSize, 0)) != DE265_OK) return err; } decode_TU(tctx, x0, y0, xCUBase, yCUBase, nT, 0, cuPredMode, cbf_luma); // --- chroma --- //const int yOffset422 = 1< 2 || ChromaArrayType == CHROMA_444) { // TODO: cross-component prediction const bool do_cross_component_prediction = (tctx->img->get_pps().range_extension.cross_component_prediction_enabled_flag && cbf_luma && (cuPredMode == MODE_INTER || tctx->img->is_IntraPredModeC_Mode4(x0, y0))); if (do_cross_component_prediction) { read_cross_comp_pred(tctx, 0); } else { tctx->ResScaleVal = 0; } { if (cbf_cb & 1) { if ((err = residual_coding(tctx, x0, y0, log2TrafoSizeC, 1)) != DE265_OK) return err; } if (sps.ChromaArrayType != CHROMA_MONO) { decode_TU(tctx, x0 / SubWidthC, y0 / SubHeightC, xCUBase / SubWidthC, yCUBase / SubHeightC, nTC, 1, cuPredMode, cbf_cb & 1); } } // 4:2:2 if (ChromaArrayType == CHROMA_422) { const int yOffset = 1 << log2TrafoSizeC; if (cbf_cb & 2) { if ((err = residual_coding(tctx, x0, y0 + 
yOffset * SubHeightC, log2TrafoSizeC, 1)) != DE265_OK) return err; } decode_TU(tctx, x0 / SubWidthC, y0 / SubHeightC + yOffset, xCUBase / SubWidthC, yCUBase / SubHeightC + yOffset, nTC, 1, cuPredMode, cbf_cb & 2); } if (do_cross_component_prediction) { read_cross_comp_pred(tctx, 1); } else { tctx->ResScaleVal = 0; } { if (cbf_cr & 1) { if ((err = residual_coding(tctx, x0, y0, log2TrafoSizeC, 2)) != DE265_OK) return err; } if (sps.ChromaArrayType != CHROMA_MONO) { decode_TU(tctx, x0 / SubWidthC, y0 / SubHeightC, xCUBase / SubWidthC, yCUBase / SubHeightC, nTC, 2, cuPredMode, cbf_cr & 1); } } // 4:2:2 if (ChromaArrayType == CHROMA_422) { const int yOffset = 1 << log2TrafoSizeC; if (cbf_cr & 2) { if ((err = residual_coding(tctx, x0, y0 + yOffset * SubHeightC, log2TrafoSizeC, 2)) != DE265_OK) return err; } decode_TU(tctx, x0 / SubWidthC, y0 / SubHeightC + yOffset, xCUBase / SubWidthC, yCUBase / SubHeightC + yOffset, nTC, 2, cuPredMode, cbf_cr & 2); } } else if (blkIdx == 3) { if (cbf_cb & 1) { if ((err = residual_coding(tctx, xBase, yBase, log2TrafoSize, 1)) != DE265_OK) return err; } if (sps.ChromaArrayType != CHROMA_MONO) { decode_TU(tctx, xBase / SubWidthC, yBase / SubHeightC, xCUBase / SubWidthC, yCUBase / SubHeightC, nT, 1, cuPredMode, cbf_cb & 1); } // 4:2:2 if (cbf_cb & 2) { if ((err = residual_coding(tctx, xBase, yBase + (1 << log2TrafoSize), log2TrafoSize, 1)) != DE265_OK) return err; } if (ChromaArrayType == CHROMA_422) { decode_TU(tctx, xBase / SubWidthC, yBase / SubHeightC + (1 << log2TrafoSize), xCUBase / SubWidthC, yCUBase / SubHeightC, nT, 1, cuPredMode, cbf_cb & 2); } if (cbf_cr & 1) { if ((err = residual_coding(tctx, xBase, yBase, log2TrafoSize, 2)) != DE265_OK) return err; } if (sps.ChromaArrayType != CHROMA_MONO) { decode_TU(tctx, xBase / SubWidthC, yBase / SubHeightC, xCUBase / SubWidthC, yCUBase / SubHeightC, nT, 2, cuPredMode, cbf_cr & 1); } // 4:2:2 if (cbf_cr & 2) { if ((err = residual_coding(tctx, xBase, yBase + (1 << log2TrafoSizeC), 
log2TrafoSize, 2)) != DE265_OK) return err; } if (ChromaArrayType == CHROMA_422) { decode_TU(tctx, xBase / SubWidthC, yBase / SubHeightC + (1 << log2TrafoSize), xCUBase / SubWidthC, yCUBase / SubHeightC, nT, 2, cuPredMode, cbf_cr & 2); } } return DE265_OK; } #if 0 static void dump_cbsize(de265_image* img) { int w = img->get_width(0); int h = img->get_height(0); for (int y = 0; y < h; y += 8) { for (int x = 0; x < w; x += 8) { printf("%d", img->get_log2CbSize(x, y)); } printf("\n"); } } #endif void read_transform_tree(thread_context* tctx, int x0, int y0, // position of TU in frame int xBase, int yBase, // position of parent TU in frame int xCUBase, int yCUBase, // position of CU in frame int log2TrafoSize, int trafoDepth, int blkIdx, int MaxTrafoDepth, int IntraSplitFlag, enum PredMode cuPredMode, uint8_t parent_cbf_cb, uint8_t parent_cbf_cr) { logtrace(LogSlice, "- read_transform_tree (interleaved) x0:%d y0:%d xBase:%d yBase:%d " "log2TrafoSize:%d trafoDepth:%d MaxTrafoDepth:%d parent-cbf-cb:%d parent-cbf-cr:%d\n", x0, y0, xBase, yBase, log2TrafoSize, trafoDepth, MaxTrafoDepth, parent_cbf_cb, parent_cbf_cr); de265_image* img = tctx->img; const seq_parameter_set& sps = img->get_sps(); int split_transform_flag; enum PredMode PredMode = img->get_pred_mode(x0, y0); assert(PredMode == cuPredMode); /* If TrafoSize is larger than maximum size -> split automatically If TrafoSize is at minimum size -> do not split If maximum transformation depth is reached -> do not split If intra-prediction is NxN mode -> split automatically (only at level 0) Otherwise -> read split flag */ if (log2TrafoSize <= sps.Log2MaxTrafoSize && log2TrafoSize > sps.Log2MinTrafoSize && trafoDepth < MaxTrafoDepth && !(IntraSplitFlag && trafoDepth == 0)) { split_transform_flag = decode_split_transform_flag(tctx, log2TrafoSize); } else { enum PartMode PartMode = img->get_PartMode(x0, y0); int interSplitFlag = (sps.max_transform_hierarchy_depth_inter == 0 && trafoDepth == 0 && PredMode == MODE_INTER && 
PartMode != PART_2Nx2N); split_transform_flag = (log2TrafoSize > sps.Log2MaxTrafoSize || (IntraSplitFlag == 1 && trafoDepth == 0) || interSplitFlag == 1) ? 1 : 0; } if (split_transform_flag && log2TrafoSize <= sps.Log2MinTrafoSize) { // TODO: it would be nice to have a flag "ignore_subsequent_errors" since the stream cannot be successfully decoded // after a bitstream error like this. But that would require that the error_queue is independent for each decoding thread // and that the flag is reset at a CABAC synchronization point. An alternative would be to simply stop the decoding this slice // after such an error. img->decctx->add_warning(DE265_WARNING_INVALID_TU_BLOCK_SPLIT, true); split_transform_flag = 0; } if (split_transform_flag) { logtrace(LogSlice, "set_split_transform_flag(%d,%d, %d)\n", x0, y0, trafoDepth); img->set_split_transform_flag(x0, y0, trafoDepth); } int cbf_cb = -1; int cbf_cr = -1; // CBF_CB/CR flags are encoded like this: // 4:2:0 and 4:4:4 modes: binary flag in bit 0 // 4:2:2 mode: bit 0: top block, bit 1: bottom block if ((log2TrafoSize > 2 && sps.ChromaArrayType != CHROMA_MONO) || sps.ChromaArrayType == CHROMA_444) { // we do not have to test for trafoDepth==0, because parent_cbf_cb is 1 at depth 0 if (/*trafoDepth==0 ||*/ parent_cbf_cb) { cbf_cb = decode_cbf_chroma(tctx, trafoDepth); if (sps.ChromaArrayType == CHROMA_422 && (!split_transform_flag || log2TrafoSize == 3)) { cbf_cb |= (decode_cbf_chroma(tctx, trafoDepth) << 1); } } // we do not have to test for trafoDepth==0, because parent_cbf_cb is 1 at depth 0 if (/*trafoDepth==0 ||*/ parent_cbf_cr) { cbf_cr = decode_cbf_chroma(tctx, trafoDepth); if (sps.ChromaArrayType == CHROMA_422 && (!split_transform_flag || log2TrafoSize == 3)) { cbf_cr |= (decode_cbf_chroma(tctx, trafoDepth) << 1); } } } //printf("CBF: cb:%d cr:%d\n",cbf_cb,cbf_cr); // cbf_cr/cbf_cb not present in bitstream -> induce values if (cbf_cb < 0) { assert(!(trafoDepth==0 && log2TrafoSize==2)); /* The standard specifies to 
check trafoDepth>0 AND log2TrafoSize==2. However, I think that trafoDepth>0 is redundant as a CB is always at least 8x8 and hence trafoDepth>0. */ if (trafoDepth > 0 && log2TrafoSize == 2) { cbf_cb = parent_cbf_cb; } else { cbf_cb = 0; } } if (cbf_cr < 0) { if (trafoDepth > 0 && log2TrafoSize == 2) { cbf_cr = parent_cbf_cr; } else { cbf_cr = 0; } } if (split_transform_flag) { int x1 = x0 + (1 << (log2TrafoSize - 1)); int y1 = y0 + (1 << (log2TrafoSize - 1)); logtrace(LogSlice, "transform split.\n"); read_transform_tree(tctx, x0, y0, x0, y0, xCUBase, yCUBase, log2TrafoSize - 1, trafoDepth + 1, 0, MaxTrafoDepth, IntraSplitFlag, cuPredMode, cbf_cb, cbf_cr); read_transform_tree(tctx, x1, y0, x0, y0, xCUBase, yCUBase, log2TrafoSize - 1, trafoDepth + 1, 1, MaxTrafoDepth, IntraSplitFlag, cuPredMode, cbf_cb, cbf_cr); read_transform_tree(tctx, x0, y1, x0, y0, xCUBase, yCUBase, log2TrafoSize - 1, trafoDepth + 1, 2, MaxTrafoDepth, IntraSplitFlag, cuPredMode, cbf_cb, cbf_cr); read_transform_tree(tctx, x1, y1, x0, y0, xCUBase, yCUBase, log2TrafoSize - 1, trafoDepth + 1, 3, MaxTrafoDepth, IntraSplitFlag, cuPredMode, cbf_cb, cbf_cr); } else { int cbf_luma; if (PredMode == MODE_INTRA || trafoDepth != 0 || cbf_cb || cbf_cr) { cbf_luma = decode_cbf_luma(tctx, trafoDepth); } else { /* There cannot be INTER blocks with no residual data. That case is already handled with rqt_root_cbf. 
*/ cbf_luma = 1; } logtrace(LogSlice, "call read_transform_unit %d/%d\n", x0, y0); read_transform_unit(tctx, x0, y0, xBase, yBase, xCUBase, yCUBase, log2TrafoSize, trafoDepth, blkIdx, cbf_luma, cbf_cb, cbf_cr); } } const char* part_mode_name(enum PartMode pm) { switch (pm) { case PART_2Nx2N: return "2Nx2N"; case PART_2NxN: return "2NxN"; case PART_Nx2N: return "Nx2N"; case PART_NxN: return "NxN"; case PART_2NxnU: return "2NxnU"; case PART_2NxnD: return "2NxnD"; case PART_nLx2N: return "nLx2N"; case PART_nRx2N: return "nRx2N"; } return "undefined part mode"; } void read_mvd_coding(thread_context* tctx, int x0, int y0, int refList) { int abs_mvd_greater0_flag[2]; abs_mvd_greater0_flag[0] = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG + 0]); abs_mvd_greater0_flag[1] = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG + 0]); int abs_mvd_greater1_flag[2]; if (abs_mvd_greater0_flag[0]) { abs_mvd_greater1_flag[0] = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG + 1]); } else { abs_mvd_greater1_flag[0] = 0; } if (abs_mvd_greater0_flag[1]) { abs_mvd_greater1_flag[1] = tctx->cabac_decoder.decode_bit( &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG + 1]); } else { abs_mvd_greater1_flag[1] = 0; } int mvd_sign_flag[2]; int16_t value[2]; for (int c = 0; c < 2; c++) { if (abs_mvd_greater0_flag[c]) { int32_t absMvd; if (abs_mvd_greater1_flag[c]) { uint32_t abs_mvd_minus2 = tctx->cabac_decoder.decode_EGk_bypass( 1); // MVD is clipped to [-32768, 32767], so cap abs value at 32768 absMvd = static_cast(std::min(abs_mvd_minus2, uint32_t{32768 - 2})) + 2; } else { absMvd = 1; } mvd_sign_flag[c] = tctx->cabac_decoder.decode_bypass(); int32_t mvd = mvd_sign_flag[c] ? 
-absMvd : absMvd; value[c] = Clip3(-32768, 32767, mvd); } else { value[c] = 0; } } tctx->motion.mvd[refList][0] = value[0]; tctx->motion.mvd[refList][1] = value[1]; logtrace(LogSlice, "MVD[%d;%d|%d] = %d;%d\n", x0, y0, refList, value[0], value[1]); } void read_prediction_unit_SKIP(thread_context* tctx, int x0, int y0, int nPbW, int nPbH) { int merge_idx = decode_merge_idx(tctx); tctx->motion.merge_idx = merge_idx; tctx->motion.merge_flag = true; logtrace(LogSlice, "prediction skip 2Nx2N, merge_idx: %d\n", merge_idx); } /* xC/yC : CB position xB/yB : position offset of the PB nPbW/nPbH : size of PB nCS : CB size */ void read_prediction_unit(thread_context* tctx, int xC, int yC, int xB, int yB, int nPbW, int nPbH, int ctDepth, int nCS, int partIdx) { logtrace(LogSlice, "read_prediction_unit %d;%d %dx%d\n", xC + xB, yC + xB, nPbW, nPbH); int x0 = xC + xB; int y0 = yC + yB; slice_segment_header* shdr = tctx->shdr; int merge_flag = decode_merge_flag(tctx); tctx->motion.merge_flag = merge_flag; if (merge_flag) { int merge_idx = decode_merge_idx(tctx); logtrace(LogSlice, "prediction unit %d,%d, merge mode, index: %d\n", x0, y0, merge_idx); tctx->motion.merge_idx = merge_idx; } else { // no merge flag enum InterPredIdc inter_pred_idc; if (shdr->slice_type == SLICE_TYPE_B) { inter_pred_idc = decode_inter_pred_idc(tctx, x0, y0, nPbW, nPbH, ctDepth); } else { inter_pred_idc = PRED_L0; } tctx->motion.inter_pred_idc = inter_pred_idc; // set_inter_pred_idc(ctx,x0,y0, inter_pred_idc); if (inter_pred_idc != PRED_L1) { int ref_idx_l0 = decode_ref_idx_lX(tctx, shdr->num_ref_idx_l0_active); // NOTE: case for only one reference frame is handles in decode_ref_idx_lX() if (ref_idx_l0 < 0 || ref_idx_l0 >= MAX_NUM_REF_PICS) { tctx->img->integrity = INTEGRITY_DECODING_ERRORS; tctx->decctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false); return; } tctx->motion.refIdx[0] = ref_idx_l0; read_mvd_coding(tctx, x0, y0, 0); int mvp_l0_flag = decode_mvp_lx_flag(tctx); // 
l0 tctx->motion.mvp_l0_flag = mvp_l0_flag; logtrace(LogSlice, "prediction unit %d,%d, L0, refIdx=%d mvp_l0_flag:%d\n", x0, y0, tctx->motion.refIdx[0], mvp_l0_flag); } if (inter_pred_idc != PRED_L0) { int ref_idx_l1 = decode_ref_idx_lX(tctx, shdr->num_ref_idx_l1_active); // NOTE: case for only one reference frame is handles in decode_ref_idx_lX() if (ref_idx_l1 < 0 || ref_idx_l1 >= MAX_NUM_REF_PICS) { tctx->img->integrity = INTEGRITY_DECODING_ERRORS; tctx->decctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false); return; } tctx->motion.refIdx[1] = ref_idx_l1; if (shdr->mvd_l1_zero_flag && inter_pred_idc == PRED_BI) { tctx->motion.mvd[1][0] = 0; tctx->motion.mvd[1][1] = 0; } else { read_mvd_coding(tctx, x0, y0, 1); } int mvp_l1_flag = decode_mvp_lx_flag(tctx); // l1 tctx->motion.mvp_l1_flag = mvp_l1_flag; logtrace(LogSlice, "prediction unit %d,%d, L1, refIdx=%d mvp_l1_flag:%d\n", x0, y0, tctx->motion.refIdx[1], mvp_l1_flag); } } decode_prediction_unit(tctx->decctx, tctx->shdr, tctx->img, tctx->motion, xC, yC, xB, yB, nCS, nPbW, nPbH, partIdx); } template void read_pcm_samples_internal(thread_context* tctx, int x0, int y0, int log2CbSize, int cIdx, bitreader& br) { const seq_parameter_set& sps = tctx->img->get_sps(); int nPcmBits; int bitDepth; int w = 1 << log2CbSize; int h = 1 << log2CbSize; if (cIdx > 0) { w /= sps.SubWidthC; h /= sps.SubHeightC; x0 /= sps.SubWidthC; y0 /= sps.SubHeightC; nPcmBits = sps.pcm_sample_bit_depth_chroma; bitDepth = sps.BitDepth_C; } else { nPcmBits = sps.pcm_sample_bit_depth_luma; bitDepth = sps.BitDepth_Y; } pixel_t* ptr; int stride; ptr = tctx->img->get_image_plane_at_pos_NEW(cIdx, x0, y0); stride = tctx->img->get_image_stride(cIdx); int shift = bitDepth - nPcmBits; // a shift < 0 may result when the SPS sequence header is broken if (shift < 0) { shift = 0; } for (int y = 0; y < h; y++) for (int x = 0; x < w; x++) { int value = br.get_bits(nPcmBits); ptr[y * stride + x] = value << shift; } } static void 
read_pcm_samples(thread_context* tctx, int x0, int y0, int log2CbSize) { bitreader br(tctx->cabac_decoder.bitstream_curr, tctx->cabac_decoder.bitstream_end - tctx->cabac_decoder.bitstream_curr); if (tctx->img->high_bit_depth(0)) { read_pcm_samples_internal(tctx, x0, y0, log2CbSize, 0, br); } else { read_pcm_samples_internal(tctx, x0, y0, log2CbSize, 0, br); } if (tctx->img->get_sps().ChromaArrayType != CHROMA_MONO) { if (tctx->img->high_bit_depth(1)) { read_pcm_samples_internal(tctx, x0, y0, log2CbSize, 1, br); read_pcm_samples_internal(tctx, x0, y0, log2CbSize, 2, br); } else { read_pcm_samples_internal(tctx, x0, y0, log2CbSize, 1, br); read_pcm_samples_internal(tctx, x0, y0, log2CbSize, 2, br); } } br.prepare_for_CABAC(); tctx->cabac_decoder.bitstream_curr = br.data; tctx->cabac_decoder.init_CABAC(); } int map_chroma_pred_mode(int intra_chroma_pred_mode, int IntraPredMode) { if (intra_chroma_pred_mode == 4) { return IntraPredMode; } else { static const enum IntraPredMode IntraPredModeCCand[4] = { INTRA_PLANAR, INTRA_ANGULAR_26, // vertical INTRA_ANGULAR_10, // horizontal INTRA_DC }; int IntraPredModeC = IntraPredModeCCand[intra_chroma_pred_mode]; if (IntraPredModeC == IntraPredMode) { return INTRA_ANGULAR_34; } else { return IntraPredModeC; } } } // h.265-V2 Table 8-3 static const uint8_t map_chroma_422[35] = { 0, 1, 2, 2, 2, 2, 3, 5, 7, 8, 10, 12, 13, 15, 17, 18, 19, 20, 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31 }; void read_coding_unit(thread_context* tctx, int x0, int y0, // position of coding unit in frame int log2CbSize, int ctDepth) { de265_image* img = tctx->img; const seq_parameter_set& sps = img->get_sps(); const pic_parameter_set& pps = img->get_pps(); slice_segment_header* shdr = tctx->shdr; logtrace(LogSlice, "- read_coding_unit %d;%d cbsize:%d\n", x0, y0, 1 << log2CbSize); //QQprintf("- read_coding_unit %d;%d cbsize:%d\n",x0,y0,1<set_log2CbSize(x0, y0, log2CbSize, true); /* This is only required on corrupted input streams. 
It may happen that there are several slices in the image that overlap. In this case, flags would accumulate from both slices. */ img->clear_split_transform_flags(x0, y0, log2CbSize); int nCbS = 1 << log2CbSize; // number of coding block samples decode_quantization_parameters(tctx, x0, y0, x0, y0); if (pps.transquant_bypass_enable_flag) { int transquant_bypass = decode_transquant_bypass_flag(tctx); tctx->cu_transquant_bypass_flag = transquant_bypass; if (transquant_bypass) { img->set_cu_transquant_bypass(x0, y0, log2CbSize); } } else { tctx->cu_transquant_bypass_flag = 0; } uint8_t cu_skip_flag = 0; if (shdr->slice_type != SLICE_TYPE_I) { cu_skip_flag = decode_cu_skip_flag(tctx, x0, y0, ctDepth); } int IntraSplitFlag = 0; enum PredMode cuPredMode; if (cu_skip_flag) { read_prediction_unit_SKIP(tctx, x0, y0, nCbS, nCbS); img->set_PartMode(x0, y0, PART_2Nx2N); // need this for deblocking filter img->set_pred_mode(x0, y0, log2CbSize, MODE_SKIP); cuPredMode = MODE_SKIP; logtrace(LogSlice, "CU pred mode: SKIP\n"); // DECODE int nCS_L = 1 << log2CbSize; decode_prediction_unit(tctx->decctx, tctx->shdr, tctx->img, tctx->motion, x0, y0, 0, 0, nCS_L, nCS_L, nCS_L, 0); } else /* not skipped */ { if (shdr->slice_type != SLICE_TYPE_I) { int pred_mode_flag = decode_pred_mode_flag(tctx); cuPredMode = pred_mode_flag ? MODE_INTRA : MODE_INTER; } else { cuPredMode = MODE_INTRA; } img->set_pred_mode(x0, y0, log2CbSize, cuPredMode); logtrace(LogSlice, "CU pred mode: %s\n", cuPredMode == MODE_INTRA ? "INTRA" : "INTER"); enum PartMode PartMode; if (cuPredMode != MODE_INTRA || log2CbSize == sps.Log2MinCbSizeY) { PartMode = decode_part_mode(tctx, cuPredMode, log2CbSize); if (PartMode == PART_NxN && cuPredMode == MODE_INTRA) { IntraSplitFlag = 1; } } else { PartMode = PART_2Nx2N; } img->set_PartMode(x0, y0, PartMode); // needed for deblocking ? 
logtrace(LogSlice, "PartMode: %s\n", part_mode_name(PartMode)); bool pcm_flag = false; if (cuPredMode == MODE_INTRA) { if (PartMode == PART_2Nx2N && sps.pcm_enabled_flag && log2CbSize >= sps.Log2MinIpcmCbSizeY && log2CbSize <= sps.Log2MaxIpcmCbSizeY) { pcm_flag = tctx->cabac_decoder.decode_term_bit(); } if (pcm_flag) { img->set_pcm_flag(x0, y0, log2CbSize); read_pcm_samples(tctx, x0, y0, log2CbSize); } else { int pbOffset = (PartMode == PART_NxN) ? (nCbS / 2) : nCbS; int log2IntraPredSize = (PartMode == PART_NxN) ? (log2CbSize - 1) : log2CbSize; logtrace(LogSlice, "nCbS:%d pbOffset:%d\n", nCbS, pbOffset); int prev_intra_luma_pred_flag[4]; int idx = 0; for (int j = 0; j < nCbS; j += pbOffset) for (int i = 0; i < nCbS; i += pbOffset) { prev_intra_luma_pred_flag[idx++] = decode_prev_intra_luma_pred_flag(tctx); } int mpm_idx[4], rem_intra_luma_pred_mode[4]; idx = 0; int availableA0 = check_CTB_available(img, x0, y0, x0 - 1, y0); int availableB0 = check_CTB_available(img, x0, y0, x0, y0 - 1); for (int j = 0; j < nCbS; j += pbOffset) for (int i = 0; i < nCbS; i += pbOffset) { if (prev_intra_luma_pred_flag[idx]) { mpm_idx[idx] = decode_mpm_idx(tctx); } else { rem_intra_luma_pred_mode[idx] = decode_rem_intra_luma_pred_mode(tctx); } int x = x0 + i; int y = y0 + j; // --- find intra prediction mode --- int IntraPredMode; int availableA = availableA0 || (i > 0); // left candidate always available for right blk int availableB = availableB0 || (j > 0); // top candidate always available for bottom blk int PUidx = (x >> sps.Log2MinPUSize) + (y >> sps.Log2MinPUSize) * sps.PicWidthInMinPUs; enum IntraPredMode candModeList[3]; fillIntraPredModeCandidates(candModeList, x, y, PUidx, availableA, availableB, img); for (int i = 0; i < 3; i++) logtrace(LogSlice, "candModeList[%d] = %d\n", i, candModeList[i]); if (prev_intra_luma_pred_flag[idx] == 1) { IntraPredMode = candModeList[mpm_idx[idx]]; } else { // sort candModeList if (candModeList[0] > candModeList[1]) { 
std::swap(candModeList[0], candModeList[1]); } if (candModeList[0] > candModeList[2]) { std::swap(candModeList[0], candModeList[2]); } if (candModeList[1] > candModeList[2]) { std::swap(candModeList[1], candModeList[2]); } // skip modes in the list // (we have 35 modes. skipping the 3 in the list gives us 32, which can be selected by 5 bits) IntraPredMode = rem_intra_luma_pred_mode[idx]; for (int n = 0; n <= 2; n++) { if (IntraPredMode >= candModeList[n]) { IntraPredMode++; } } } logtrace(LogSlice, "IntraPredMode[%d][%d] = %d (log2blk:%d)\n", x, y, IntraPredMode, log2IntraPredSize); img->set_IntraPredMode(PUidx, log2IntraPredSize, (enum IntraPredMode) IntraPredMode); idx++; } // set chroma intra prediction mode if (sps.ChromaArrayType == CHROMA_444) { // chroma 4:4:4 idx = 0; for (int j = 0; j < nCbS; j += pbOffset) for (int i = 0; i < nCbS; i += pbOffset) { int x = x0 + i; int y = y0 + j; int intra_chroma_pred_mode = decode_intra_chroma_pred_mode(tctx); int IntraPredMode = img->get_IntraPredMode(x, y); int IntraPredModeC = map_chroma_pred_mode(intra_chroma_pred_mode, IntraPredMode); logtrace(LogSlice, "IntraPredModeC[%d][%d]: %d (blksize:%d)\n", x, y, IntraPredModeC, 1 << log2IntraPredSize); img->set_IntraPredModeC(x, y, log2IntraPredSize, (enum IntraPredMode) IntraPredModeC, intra_chroma_pred_mode == 4); idx++; } } else if (sps.ChromaArrayType != CHROMA_MONO) { // chroma 4:2:0 and 4:2:2 int intra_chroma_pred_mode = decode_intra_chroma_pred_mode(tctx); int IntraPredMode = img->get_IntraPredMode(x0, y0); logtrace(LogSlice, "IntraPredMode: %d\n", IntraPredMode); int IntraPredModeC = map_chroma_pred_mode(intra_chroma_pred_mode, IntraPredMode); if (sps.ChromaArrayType == CHROMA_422) { IntraPredModeC = map_chroma_422[IntraPredModeC]; } img->set_IntraPredModeC(x0, y0, log2CbSize, (enum IntraPredMode) IntraPredModeC, intra_chroma_pred_mode == 4); } } } else { // INTER int nCS = 1 << log2CbSize; if (PartMode == PART_2Nx2N) { read_prediction_unit(tctx, x0, y0, 0, 0, nCbS, 
nCbS, ctDepth, nCS, 0); } else if (PartMode == PART_2NxN) { read_prediction_unit(tctx, x0, y0, 0, 0, nCbS, nCbS / 2, ctDepth, nCS, 0); read_prediction_unit(tctx, x0, y0, 0, nCbS / 2, nCbS, nCbS / 2, ctDepth, nCS, 1); } else if (PartMode == PART_Nx2N) { read_prediction_unit(tctx, x0, y0, 0, 0, nCbS / 2, nCbS, ctDepth, nCS, 0); read_prediction_unit(tctx, x0, y0, nCbS / 2, 0, nCbS / 2, nCbS, ctDepth, nCS, 1); } else if (PartMode == PART_2NxnU) { read_prediction_unit(tctx, x0, y0, 0, 0, nCbS, nCbS / 4, ctDepth, nCS, 0); read_prediction_unit(tctx, x0, y0, 0, nCbS / 4, nCbS, nCbS * 3 / 4, ctDepth, nCS, 1); } else if (PartMode == PART_2NxnD) { read_prediction_unit(tctx, x0, y0, 0, 0, nCbS, nCbS * 3 / 4, ctDepth, nCS, 0); read_prediction_unit(tctx, x0, y0, 0, nCbS * 3 / 4, nCbS, nCbS / 4, ctDepth, nCS, 1); } else if (PartMode == PART_nLx2N) { read_prediction_unit(tctx, x0, y0, 0, 0, nCbS / 4, nCbS, ctDepth, nCS, 0); read_prediction_unit(tctx, x0, y0, nCbS / 4, 0, nCbS * 3 / 4, nCbS, ctDepth, nCS, 1); } else if (PartMode == PART_nRx2N) { read_prediction_unit(tctx, x0, y0, 0, 0, nCbS * 3 / 4, nCbS, ctDepth, nCS, 0); read_prediction_unit(tctx, x0, y0, nCbS * 3 / 4, 0, nCbS / 4, nCbS, ctDepth, nCS, 1); } else if (PartMode == PART_NxN) { read_prediction_unit(tctx, x0, y0, 0, 0, nCbS / 2, nCbS / 2, ctDepth, nCS, 0); read_prediction_unit(tctx, x0, y0, nCbS / 2, 0, nCbS / 2, nCbS / 2, ctDepth, nCS, 1); read_prediction_unit(tctx, x0, y0, 0, nCbS / 2, nCbS / 2, nCbS / 2, ctDepth, nCS, 2); read_prediction_unit(tctx, x0, y0, nCbS / 2, nCbS / 2, nCbS / 2, nCbS / 2, ctDepth, nCS, 3); } else { assert(0); // undefined PartMode } } // INTER // decode residual if (!pcm_flag) { // !pcm bool rqt_root_cbf; uint8_t merge_flag = tctx->motion.merge_flag; // !!get_merge_flag(ctx,x0,y0); if (cuPredMode != MODE_INTRA && !(PartMode == PART_2Nx2N && merge_flag)) { rqt_root_cbf = !!decode_rqt_root_cbf(tctx); } else { /* rqt_root_cbf=1 is inferred for Inter blocks with 2Nx2N, merge mode. 
These must be some residual data, because otherwise, the CB could also be coded in SKIP mode. */ rqt_root_cbf = true; } //set_rqt_root_cbf(ctx,x0,y0, log2CbSize, rqt_root_cbf); if (rqt_root_cbf) { int MaxTrafoDepth; if (cuPredMode == MODE_INTRA) { MaxTrafoDepth = sps.max_transform_hierarchy_depth_intra + IntraSplitFlag; } else { MaxTrafoDepth = sps.max_transform_hierarchy_depth_inter; } logtrace(LogSlice, "MaxTrafoDepth: %d\n", MaxTrafoDepth); uint8_t initial_chroma_cbf = 1; if (sps.ChromaArrayType == CHROMA_MONO) { initial_chroma_cbf = 0; } read_transform_tree(tctx, x0, y0, x0, y0, x0, y0, log2CbSize, 0, 0, MaxTrafoDepth, IntraSplitFlag, cuPredMode, initial_chroma_cbf, initial_chroma_cbf); } } // !pcm } } // ------------------------------------------------------------------------------------------ void read_coding_quadtree(thread_context* tctx, int x0, int y0, int log2CbSize, int ctDepth) { logtrace(LogSlice, "- read_coding_quadtree %d;%d cbsize:%d depth:%d POC:%d\n", x0, y0, 1 << log2CbSize, ctDepth, tctx->img->PicOrderCntVal); de265_image* img = tctx->img; const seq_parameter_set& sps = img->get_sps(); const pic_parameter_set& pps = img->get_pps(); int split_flag; // We only send a split flag if CU is larger than minimum size and // completely contained within the image area. // If it is partly outside the image area and not at minimum size, // it is split. If already at minimum size, it is not split further. if (x0 + (1 << log2CbSize) <= sps.pic_width_in_luma_samples && y0 + (1 << log2CbSize) <= sps.pic_height_in_luma_samples && log2CbSize > sps.Log2MinCbSizeY) { split_flag = decode_split_cu_flag(tctx, x0, y0, ctDepth); } else { if (log2CbSize > sps.Log2MinCbSizeY) { split_flag = 1; } else { split_flag = 0; } } if (pps.cu_qp_delta_enabled_flag && log2CbSize >= pps.Log2MinCuQpDeltaSize) { tctx->IsCuQpDeltaCoded = 0; tctx->CuQpDelta = 0; } else { // shdr->CuQpDelta = 0; // TODO check: is this the right place to set to default value ? 
} if (tctx->shdr->cu_chroma_qp_offset_enabled_flag && log2CbSize >= pps.Log2MinCuChromaQpOffsetSize) { tctx->IsCuChromaQpOffsetCoded = 0; } if (split_flag) { int x1 = x0 + (1 << (log2CbSize - 1)); int y1 = y0 + (1 << (log2CbSize - 1)); read_coding_quadtree(tctx, x0, y0, log2CbSize - 1, ctDepth + 1); if (x1 < sps.pic_width_in_luma_samples) read_coding_quadtree(tctx, x1, y0, log2CbSize - 1, ctDepth + 1); if (y1 < sps.pic_height_in_luma_samples) read_coding_quadtree(tctx, x0, y1, log2CbSize - 1, ctDepth + 1); if (x1 < sps.pic_width_in_luma_samples && y1 < sps.pic_height_in_luma_samples) read_coding_quadtree(tctx, x1, y1, log2CbSize - 1, ctDepth + 1); } else { // set ctDepth of this CU img->set_ctDepth(x0, y0, log2CbSize, ctDepth); read_coding_unit(tctx, x0, y0, log2CbSize, ctDepth); } logtrace(LogSlice, "-\n"); } // --------------------------------------------------------------------------- enum DecodeResult { Decode_EndOfSliceSegment, Decode_EndOfSubstream, Decode_Error }; /* Decode CTBs until the end of sub-stream, the end-of-slice, or some error occurs. 
*/ enum DecodeResult decode_substream(thread_context* tctx, bool block_wpp, // block on WPP dependencies bool first_independent_substream) { const pic_parameter_set& pps = tctx->img->get_pps(); const seq_parameter_set& sps = tctx->img->get_sps(); const uint16_t ctbW = sps.PicWidthInCtbsY; const uint16_t startCtbY = tctx->CtbY; //printf("start decoding substream at %d;%d\n",tctx->CtbX,tctx->CtbY); // in WPP mode: initialize CABAC model with stored model from row above if ((!first_independent_substream || tctx->CtbY != startCtbY) && pps.entropy_coding_sync_enabled_flag && tctx->CtbY >= 1 && tctx->CtbX == 0) { if (sps.PicWidthInCtbsY > 1) { assert(tctx->CtbY >= 1); if (static_cast(tctx->CtbY - 1) >= tctx->imgunit->ctx_models.size()) { return Decode_Error; } //printf("CTX wait on %d/%d\n",1,tctx->CtbY-1); // we have to wait until the context model data is there tctx->img->wait_for_progress(tctx->task, 1, tctx->CtbY - 1,CTB_PROGRESS_PREFILTER); // copy CABAC model from previous CTB row tctx->ctx_model = tctx->imgunit->ctx_models[(tctx->CtbY - 1)]; tctx->imgunit->ctx_models[(tctx->CtbY - 1)].release(); // not used anymore } else { tctx->img->wait_for_progress(tctx->task, 0, tctx->CtbY - 1,CTB_PROGRESS_PREFILTER); initialize_CABAC_models(tctx); } } do { const uint32_t ctbx = tctx->CtbX; const uint32_t ctby = tctx->CtbY; if (ctbx + ctby * ctbW >= pps.CtbAddrRStoTS.size()) { return Decode_Error; } if (ctbx >= sps.PicWidthInCtbsY || ctby >= sps.PicHeightInCtbsY) { return Decode_Error; } if (block_wpp && ctby > 0 && ctbx + 1 < ctbW) { // TODO: if we are in tiles mode and at the right border, do not wait for x+1,y-1 //printf("wait on %d/%d (%d)\n",ctbx+1,ctby-1, ctbx+1+(ctby-1)*sps->PicWidthInCtbsY); tctx->img->wait_for_progress(tctx->task, ctbx + 1, ctby - 1, CTB_PROGRESS_PREFILTER); } //printf("%p: decode %d;%d\n", tctx, tctx->CtbX,tctx->CtbY); // read and decode CTB if (tctx->ctx_model.empty() == false) { return Decode_Error; } read_coding_tree_unit(tctx); // save 
CABAC-model for WPP (except in last CTB row) if (pps.entropy_coding_sync_enabled_flag && ctbx == 1 && ctby + 1 < sps.PicHeightInCtbsY) { // no storage for context table has been allocated if (tctx->imgunit->ctx_models.size() <= ctby) { return Decode_Error; } tctx->imgunit->ctx_models[ctby] = tctx->ctx_model; tctx->imgunit->ctx_models[ctby].decouple(); // store an independent copy } // end of slice segment ? int end_of_slice_segment_flag = tctx->cabac_decoder.decode_term_bit(); //printf("end-of-slice flag: %d\n", end_of_slice_segment_flag); if (end_of_slice_segment_flag) { // at the end of the slice segment, we store the CABAC model if we need it // because a dependent slice may follow if (pps.dependent_slice_segments_enabled_flag) { tctx->shdr->ctx_model_storage = tctx->ctx_model; tctx->shdr->ctx_model_storage.decouple(); // store an independent copy tctx->shdr->ctx_model_storage_defined = true; } } tctx->img->ctb_progress[ctbx + ctby * ctbW].set_progress(CTB_PROGRESS_PREFILTER); //printf("%p: decoded %d|%d\n",tctx, ctby,ctbx); logtrace(LogSlice, "read CTB %d -> end=%d\n", tctx->CtbAddrInRS, end_of_slice_segment_flag); //printf("read CTB %d -> end=%d\n", tctx->CtbAddrInRS, end_of_slice_segment_flag); const int lastCtbY = tctx->CtbY; bool endOfPicture = advanceCtbAddr(tctx); // true if we read past the end of the image if (endOfPicture && end_of_slice_segment_flag == false) { tctx->decctx->add_warning(DE265_WARNING_CTB_OUTSIDE_IMAGE_AREA, false); tctx->img->integrity = INTEGRITY_DECODING_ERRORS; return Decode_Error; } if (end_of_slice_segment_flag) { /* corrupted inputs may send the end_of_slice_segment_flag even if not all CTBs in a row have been coded. Hence, we mark all of them as finished. 
*/ /* for (int x = ctbx+1 ; xPicWidthInCtbsY; x++) { printf("mark skipped %d;%d\n",ctbx,ctby); tctx->img->ctb_progress[ctbx+ctby*ctbW].set_progress(CTB_PROGRESS_PREFILTER); } */ return Decode_EndOfSliceSegment; } if (!end_of_slice_segment_flag) { bool end_of_sub_stream = false; end_of_sub_stream |= (pps.tiles_enabled_flag && pps.TileId[tctx->CtbAddrInTS] != pps.TileId[tctx->CtbAddrInTS - 1]); end_of_sub_stream |= (pps.entropy_coding_sync_enabled_flag && lastCtbY != tctx->CtbY); if (end_of_sub_stream) { int end_of_sub_stream_one_bit = tctx->cabac_decoder.decode_term_bit(); if (!end_of_sub_stream_one_bit) { tctx->decctx->add_warning(DE265_WARNING_EOSS_BIT_NOT_SET, false); tctx->img->integrity = INTEGRITY_DECODING_ERRORS; return Decode_Error; } tctx->cabac_decoder.init_CABAC(); // byte alignment return Decode_EndOfSubstream; } } } while (true); } bool initialize_CABAC_at_slice_segment_start(thread_context* tctx) { de265_image* img = tctx->img; const pic_parameter_set& pps = img->get_pps(); const seq_parameter_set& sps = img->get_sps(); slice_segment_header* shdr = tctx->shdr; if (shdr->dependent_slice_segment_flag) { int prevCtb = pps.CtbAddrTStoRS[pps.CtbAddrRStoTS[shdr->slice_segment_address] - 1]; uint16_t sliceIdx = img->get_SliceHeaderIndex_atIndex(prevCtb); if (sliceIdx >= img->slices.size()) { return false; } slice_segment_header* prevCtbHdr = img->slices[sliceIdx]; if (pps.is_tile_start_CTB(shdr->slice_segment_address % sps.PicWidthInCtbsY, shdr->slice_segment_address / sps.PicWidthInCtbsY )) { initialize_CABAC_models(tctx); } else { // wait for previous slice to finish decoding //printf("wait for previous slice to finish decoding\n"); slice_unit* prevSliceSegment = tctx->imgunit->get_prev_slice_segment(tctx->sliceunit); //assert(prevSliceSegment); if (prevSliceSegment == nullptr) { return false; } prevSliceSegment->finished_threads.wait_for_progress(prevSliceSegment->nThreads); /* printf("wait for %d,%d (init)\n", prevCtb / sps->PicWidthInCtbsY, prevCtb % 
sps->PicWidthInCtbsY); tctx->img->wait_for_progress(tctx->task, prevCtb, CTB_PROGRESS_PREFILTER); */ if (!prevCtbHdr->ctx_model_storage_defined) { return false; } tctx->ctx_model = prevCtbHdr->ctx_model_storage; prevCtbHdr->ctx_model_storage.release(); } } else { initialize_CABAC_models(tctx); } return true; } std::string thread_task_ctb_row::name() const { char buf[100]; sprintf(buf, "ctb-row-%d", debug_startCtbRow); return buf; } std::string thread_task_slice_segment::name() const { char buf[100]; sprintf(buf, "slice-segment-%d;%d", debug_startCtbX, debug_startCtbY); return buf; } void thread_task_slice_segment::work() { thread_task_slice_segment* data = this; thread_context* tctx = data->tctx; de265_image* img = tctx->img; state = Running; img->thread_run(this); setCtbAddrFromTS(tctx); //printf("%p: A start decoding at %d/%d\n", tctx, tctx->CtbX,tctx->CtbY); if (data->firstSliceSubstream) { bool success = initialize_CABAC_at_slice_segment_start(tctx); if (!success) { state = Finished; tctx->sliceunit->finished_threads.increase_progress(1); img->thread_finishes(this); return; } } else { initialize_CABAC_models(tctx); } tctx->cabac_decoder.init_CABAC(); /*enum DecodeResult result =*/ decode_substream(tctx, false, data->firstSliceSubstream); state = Finished; tctx->sliceunit->finished_threads.increase_progress(1); img->thread_finishes(this); return; // DE265_OK; } void thread_task_ctb_row::work() { thread_task_ctb_row* data = this; thread_context* tctx = data->tctx; de265_image* img = tctx->img; const seq_parameter_set& sps = img->get_sps(); int ctbW = sps.PicWidthInCtbsY; state = Running; img->thread_run(this); setCtbAddrFromTS(tctx); int ctby = tctx->CtbAddrInRS / ctbW; int myCtbRow = ctby; //printf("start CTB-row decoding at row %d\n", ctby); if (data->firstSliceSubstream) { bool success = initialize_CABAC_at_slice_segment_start(tctx); if (!success) { // could not decode this row, mark whole row as finished for (int x = 0; x < ctbW; x++) { 
img->ctb_progress[myCtbRow * ctbW + x].set_progress(CTB_PROGRESS_PREFILTER); } state = Finished; tctx->sliceunit->finished_threads.increase_progress(1); img->thread_finishes(this); return; } //initialize_CABAC(tctx); } tctx->cabac_decoder.init_CABAC(); bool firstIndependentSubstream = data->firstSliceSubstream && !tctx->shdr->dependent_slice_segment_flag; /*enum DecodeResult result =*/ decode_substream(tctx, true, firstIndependentSubstream); // mark progress on remaining CTBs in row (in case of decoder error and early termination) // TODO: what about slices that end properly in the middle of a CTB row? if (tctx->CtbY == myCtbRow) { int lastCtbX = sps.PicWidthInCtbsY; // assume no tiles when WPP is on for (int x = tctx->CtbX; x < lastCtbX; x++) { if (x < sps.PicWidthInCtbsY && myCtbRow < sps.PicHeightInCtbsY) { img->ctb_progress[myCtbRow * ctbW + x].set_progress(CTB_PROGRESS_PREFILTER); } } } state = Finished; tctx->sliceunit->finished_threads.increase_progress(1); img->thread_finishes(this); } de265_error read_slice_segment_data(thread_context* tctx) { setCtbAddrFromTS(tctx); de265_image* img = tctx->img; const pic_parameter_set& pps = img->get_pps(); //const seq_parameter_set& sps = img->get_sps(); slice_segment_header* shdr = tctx->shdr; bool success = initialize_CABAC_at_slice_segment_start(tctx); if (!success) { return DE265_ERROR_UNSPECIFIED_DECODING_ERROR; } tctx->cabac_decoder.init_CABAC(); //printf("-----\n"); bool first_slice_substream = !shdr->dependent_slice_segment_flag; uint32_t substream = 0; enum DecodeResult result; do { //int ctby = tctx->CtbY; // check whether entry_points[] are correct in the bitstream if (substream > 0) { if (substream - 1 >= tctx->shdr->entry_point_offset.size() || tctx->cabac_decoder.bitstream_curr - tctx->cabac_decoder.bitstream_start - 2 /* -2 because of CABAC init */ != tctx->shdr->entry_point_offset[substream - 1]) { tctx->decctx->add_warning(DE265_WARNING_INCORRECT_ENTRY_POINT_OFFSET, true); } } substream++; result = 
decode_substream(tctx, false, first_slice_substream); if (result == Decode_EndOfSliceSegment || result == Decode_Error) { break; } first_slice_substream = false; if (pps.tiles_enabled_flag) { initialize_CABAC_models(tctx); } } while (true); return DE265_OK; } /* TODO: When a task wants to block, but is the first in the list of pending tasks, do some error concealment instead of blocking, since it will never be deblocked. This will only happen in the case of input error. */ libde265-1.0.18/libde265/slice.h000066400000000000000000000224211515675107500157160ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * Authors: struktur AG, Dirk Farin * Min Chen * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
 */

#ifndef DE265_SLICE_H
#define DE265_SLICE_H

#include "libde265/cabac.h"
#include "libde265/de265.h"
#include "libde265/util.h"
#include "libde265/refpic.h"
#include "libde265/threads.h"
#include "contextmodel.h"

// NOTE(review): the header names of the next three includes are missing in this
// copy of the file - restore them from the upstream source before building.
#include
#include
#include

class decoder_context;
class thread_context;
class error_queue;
class seq_parameter_set;
class pic_parameter_set;

// Numeric values of the slice types (see slice_segment_header::slice_type).
enum SliceType
  {
    SLICE_TYPE_B = 0,
    SLICE_TYPE_P = 1,
    SLICE_TYPE_I = 2
  };

/*
        2Nx2N           2NxN             Nx2N            NxN
      +-------+       +-------+       +---+---+       +---+---+
      |       |       |       |       |   |   |       |   |   |
      |       |       |_______|       |   |   |       |___|___|
      |       |       |       |       |   |   |       |   |   |
      |       |       |       |       |   |   |       |   |   |
      +-------+       +-------+       +---+---+       +---+---+

        2NxnU           2NxnD           nLx2N           nRx2N
      +-------+       +-------+       +-+-----+       +-----+-+
      |_______|       |       |       | |     |       |     | |
      |       |       |       |       | |     |       |     | |
      |       |       |_______|       | |     |       |     | |
      |       |       |       |       | |     |       |     | |
      +-------+       +-------+       +-+-----+       +-----+-+

      - AMP only if CU size > min CU size -> minimum PU size = CUsize/2
      - NxN only if size >= 16x16 (-> minimum block size = 8x8)
      - minimum block size for Bi-Pred is 8x8 (wikipedia: Coding_tree_unit)
*/
// How a coding block is divided into prediction blocks (shapes shown above).
enum PartMode
  {
    PART_2Nx2N = 0,
    PART_2NxN  = 1,
    PART_Nx2N  = 2,
    PART_NxN   = 3,
    PART_2NxnU = 4,
    PART_2NxnD = 5,
    PART_nLx2N = 6,
    PART_nRx2N = 7
  };

// Declared here, defined elsewhere; presumably returns a printable name for the
// given mode - confirm at the definition.
const char* part_mode_name(PartMode);


// Prediction mode of a coding unit.
enum PredMode
  {
    MODE_INTRA, MODE_INTER, MODE_SKIP
  };

// Luma intra prediction modes: planar, DC and the angular modes 2..34.
enum IntraPredMode
  {
    INTRA_PLANAR = 0,
    INTRA_DC = 1,
    INTRA_ANGULAR_2 = 2,   INTRA_ANGULAR_3 = 3,   INTRA_ANGULAR_4 = 4,   INTRA_ANGULAR_5 = 5,
    INTRA_ANGULAR_6 = 6,   INTRA_ANGULAR_7 = 7,   INTRA_ANGULAR_8 = 8,   INTRA_ANGULAR_9 = 9,
    INTRA_ANGULAR_10 = 10, INTRA_ANGULAR_11 = 11, INTRA_ANGULAR_12 = 12, INTRA_ANGULAR_13 = 13,
    INTRA_ANGULAR_14 = 14, INTRA_ANGULAR_15 = 15, INTRA_ANGULAR_16 = 16, INTRA_ANGULAR_17 = 17,
    INTRA_ANGULAR_18 = 18, INTRA_ANGULAR_19 = 19, INTRA_ANGULAR_20 = 20, INTRA_ANGULAR_21 = 21,
    INTRA_ANGULAR_22 = 22, INTRA_ANGULAR_23 = 23, INTRA_ANGULAR_24 = 24, INTRA_ANGULAR_25 = 25,
    INTRA_ANGULAR_26 = 26, INTRA_ANGULAR_27 = 27, INTRA_ANGULAR_28 = 28, INTRA_ANGULAR_29 = 29,
    INTRA_ANGULAR_30 = 30, INTRA_ANGULAR_31 = 31, INTRA_ANGULAR_32 = 32, INTRA_ANGULAR_33 = 33,
    INTRA_ANGULAR_34 = 34
  };


// Coded chroma intra prediction mode. The "_OR_34" suffix presumably means that
// mode 34 is substituted when the named mode equals the luma mode - confirm
// where the value is decoded.
enum IntraChromaPredMode
  {
    INTRA_CHROMA_PLANAR_OR_34     = 0,
    INTRA_CHROMA_ANGULAR_26_OR_34 = 1,
    INTRA_CHROMA_ANGULAR_10_OR_34 = 2,
    INTRA_CHROMA_DC_OR_34         = 3,
    INTRA_CHROMA_LIKE_LUMA        = 4
  };


// Which reference picture list(s) an inter prediction uses.
enum InterPredIdc
  {
    // note: values have to match the decoding function decode_inter_pred_idc()
    PRED_L0=1,
    PRED_L1=2,
    PRED_BI=3
  };



// Holds the syntax elements of one slice segment header together with the
// values derived from them and per-slice state used while decoding.
class slice_segment_header {
public:
  slice_segment_header() {
    reset();
  }

  // Parse / serialize the header. Declared here, defined elsewhere.
  de265_error read(bitreader* br, decoder_context*, bool* continueDecoding);
  de265_error write(error_queue*, CABAC_encoder&,
                    const seq_parameter_set* sps,
                    const pic_parameter_set* pps,
                    uint8_t nal_unit_type);

  void dump_slice_segment_header(const decoder_context*, int fd) const;

  void set_defaults();
  void reset();


  int  slice_index; // index through all slices in a picture  (internal only)
  // NOTE(review): the template argument of this shared_ptr is missing in this
  // copy of the file.
  std::shared_ptr pps;


  bool first_slice_segment_in_pic_flag;
  bool no_output_of_prior_pics_flag;
  uint8_t slice_pic_parameter_set_id; // [0;63]
  bool dependent_slice_segment_flag;
  uint32_t slice_segment_address;
  uint8_t slice_type; // [0;2]
  bool pic_output_flag;
  char colour_plane_id;
  int  slice_pic_order_cnt_lsb;
  bool short_term_ref_pic_set_sps_flag;
  ref_pic_set slice_ref_pic_set;

  uint8_t short_term_ref_pic_set_idx;
  uint8_t num_long_term_sps;  // [0;32]
  uint8_t num_long_term_pics; // [0;32]

  uint8_t lt_idx_sps[MAX_NUM_REF_PICS];
  int     poc_lsb_lt[MAX_NUM_REF_PICS];
  bool    used_by_curr_pic_lt_flag[MAX_NUM_REF_PICS];

  bool delta_poc_msb_present_flag[MAX_NUM_REF_PICS];
  uint32_t delta_poc_msb_cycle_lt[MAX_NUM_REF_PICS];

  bool slice_temporal_mvp_enabled_flag;
  bool slice_sao_luma_flag;
  bool slice_sao_chroma_flag;

  bool num_ref_idx_active_override_flag;
  uint8_t num_ref_idx_l0_active; // [1;16]
  uint8_t num_ref_idx_l1_active; // [1;16]

  bool ref_pic_list_modification_flag_l0;
  bool ref_pic_list_modification_flag_l1;
  uint8_t list_entry_l0[16];
  uint8_t list_entry_l1[16];

  bool mvd_l1_zero_flag;
  bool cabac_init_flag;
  bool collocated_from_l0_flag;
  uint8_t collocated_ref_idx; // [0;15]

  // --- pred_weight_table ---

  uint8_t luma_log2_weight_denom; // [0;7]
  uint8_t ChromaLog2WeightDenom;  // [0;7]

  // first index is L0/L1
  uint8_t luma_weight_flag[2][16];   // bool
  uint8_t chroma_weight_flag[2][16]; // bool
  int16_t LumaWeight[2][16];
  int16_t luma_offset[2][16];
  int16_t ChromaWeight[2][16][2];
  int16_t ChromaOffset[2][16][2];


  uint8_t five_minus_max_num_merge_cand; // [0;5]
  int  slice_qp_delta;

  int  slice_cb_qp_offset;
  int  slice_cr_qp_offset;

  bool cu_chroma_qp_offset_enabled_flag;

  bool deblocking_filter_override_flag;
  bool slice_deblocking_filter_disabled_flag;
  int8_t slice_beta_offset; // [-12;12], = pps->beta_offset if undefined
  int8_t slice_tc_offset;   // [-12;12], = pps->tc_offset if undefined

  bool slice_loop_filter_across_slices_enabled_flag;

  int  num_entry_point_offsets;
  int  offset_len;
  // Compared against the CABAC read position at the start of every sub-stream
  // after the first one (read_slice_segment_data()).
  // NOTE(review): the element type of this vector is missing in this copy of
  // the file.
  std::vector entry_point_offset;

  int  slice_segment_header_extension_length;


  // --- derived data ---

  int SliceQPY;
  int initType;

  void compute_derived_values(const pic_parameter_set* pps);


  // --- data for external modules ---

  uint32_t SliceAddrRS; // slice_segment_address of last independent slice

  int MaxNumMergeCand;  // directly derived from 'five_minus_max_num_merge_cand'
  int CurrRpsIdx;
  ref_pic_set CurrRps;  // the active reference-picture set
  int NumPocTotalCurr;

  // number of entries: num_ref_idx_l0_active / num_ref_idx_l1_active
  int RefPicList[2][MAX_NUM_REF_PICS]; // contains buffer IDs (D:indices into DPB/E:frame number)
  int RefPicList_POC[2][MAX_NUM_REF_PICS];
  int RefPicList_PicState[2][MAX_NUM_REF_PICS]; /* We have to save the PicState because the decoding
                                                   of an image may be delayed and the PicState can
                                                   change in the mean-time (e.g. from ShortTerm to
                                                   LongTerm). PicState is used in motion.cc */

  bool LongTermRefPic[2][MAX_NUM_REF_PICS]; /* Flag whether the picture at this ref-pic-list
                                               is a long-term picture. */

  // context storage for dependent slices (stores CABAC model at end of slice segment)
  context_model_table ctx_model_storage;
  bool ctx_model_storage_defined; // whether there is valid data in ctx_model_storage

  // NOTE(review): the element type of this vector is missing in this copy of
  // the file.
  std::vector RemoveReferencesList; // images that can be removed from the DPB before decoding this slice

};



// Sample-adaptive-offset parameters of one CTB.
struct sao_info {
  // TODO: we could combine SaoTypeIdx and SaoEoClass into one byte to make the struct 16 bytes only

  unsigned char SaoTypeIdx; // use with (SaoTypeIdx>>(2*cIdx)) & 0x3
  unsigned char SaoEoClass; // presumably use with (SaoEoClass>>(2*cIdx)) & 0x3 - the original note
                            // repeated the SaoTypeIdx expression; TODO confirm against the users

  uint8_t sao_band_position[3];
  int8_t  saoOffsetVal[3][4]; // index with [][idx-1] as saoOffsetVal[][0]==0 always
};




// Decodes the slice segment attached to 'tctx' in the calling thread.
de265_error read_slice_segment_data(thread_context* tctx);

bool alloc_and_init_significant_coeff_ctxIdx_lookupTable();
void free_significant_coeff_ctxIdx_lookupTable();


// Task that decodes one CTB row; its work() blocks on WPP dependencies.
class thread_task_ctb_row : public thread_task
{
public:
  bool   firstSliceSubstream;
  int    debug_startCtbRow;   // only used to build the task name
  thread_context* tctx;

  void work() override;
  std::string name() const override;
};


// Task that decodes one sub-stream of a slice segment without waiting on WPP
// dependencies.
class thread_task_slice_segment : public thread_task
{
public:
  bool   firstSliceSubstream;
  int    debug_startCtbX, debug_startCtbY;   // only used to build the task name
  thread_context* tctx;

  void work() override;
  std::string name() const override;
};


int check_CTB_available(const de265_image* img,
                        int xC,int yC, int xN,int yN);

#endif
libde265-1.0.18/libde265/sps.cc000066400000000000000000001275621515675107500155760ustar00rootroot00000000000000
/*
 * H.265 video codec.
 * Copyright (c) 2013-2014 struktur AG, Dirk Farin
 *
 * This file is part of libde265.
 *
 * libde265 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * libde265 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with libde265.  If not, see .
 */

#include "sps.h"
#include "util.h"
#include "scan.h"
#include "decctx.h"

// NOTE(review): the header names of the next three includes are missing in this
// copy of the file - restore them from the upstream source before building.
#include
#include
#include

// Debug switch: the error messages guarded by "if (D)" in
// compute_derived_values() are only printed when this is non-zero.
#define D 0

// Reads one variable-length code with br->get_<vlctype>() into the local 'vlc'
// and assigns it to 'variable'. On UVLC_ERROR a warning is queued and the
// ENCLOSING function returns DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE.
// Requires 'vlc', 'br' and 'errqueue' to be in scope at the point of use.
// Expands to more than one statement, so it must not be used as the unbraced
// body of an if/else.
#define READ_VLC(variable, vlctype)                                     \
  if ((vlc = br->get_ ## vlctype()) == UVLC_ERROR) {                    \
    errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); \
    return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;                    \
  }                                                                     \
  variable = vlc;


// Chroma subsampling factors, indexed by chroma_format_idc (0..3); see
// compute_derived_values().
static int SubWidthC_tab[]  = { 1,2,2,1 };
static int SubHeightC_tab[] = { 1,2,1,1 };


// TODO if (!check_high(ctx, vlc, 15)) return false;
// TODO if (!check_ulvc(ctx, vlc)) return false;


// TODO: should be in some header-file of refpic.c
// NOTE(review): the element type of the 'sets' vectors below is missing in this
// copy of the file.
extern bool read_short_term_ref_pic_set(error_queue* errqueue,
                                        const seq_parameter_set* sps,
                                        bitreader* br,
                                        ref_pic_set* out_set,
                                        int idxRps,  // index of the set to be read
                                        const std::vector& sets,
                                        bool sliceRefPicSet);

extern bool write_short_term_ref_pic_set(error_queue* errqueue,
                                         const seq_parameter_set* sps,
                                         CABAC_encoder& out,
                                         const ref_pic_set* in_set, // which set to write
                                         int idxRps,  // index of the set to be read
                                         const std::vector& sets, // previously read sets
                                         bool sliceRefPicSet); // is this in the slice header?
sps_range_extension::sps_range_extension() = default; seq_parameter_set::seq_parameter_set() = default; seq_parameter_set::~seq_parameter_set() { //free(ref_pic_sets); } void seq_parameter_set::set_defaults(enum PresetSet) { video_parameter_set_id = 0; sps_max_sub_layers = 1; sps_temporal_id_nesting_flag = 1; profile_tier_level_.general.set_defaults(Profile_Main, 6,2); // TODO seq_parameter_set_id = 0; chroma_format_idc = 1; ChromaArrayType = chroma_format_idc; separate_colour_plane_flag = 0; pic_width_in_luma_samples = 0; pic_height_in_luma_samples = 0; conformance_window_flag = 0; conf_win_left_offset = 0; conf_win_right_offset = 0; conf_win_top_offset = 0; conf_win_bottom_offset = 0; bit_depth_luma =8; bit_depth_chroma=8; log2_max_pic_order_cnt_lsb = 8; sps_sub_layer_ordering_info_present_flag = 0; sps_max_dec_pic_buffering[0] = 1; sps_max_num_reorder_pics[0] = 0; sps_max_latency_increase_plus1[0] = 0; set_CB_log2size_range(4,4); set_TB_log2size_range(3,4); max_transform_hierarchy_depth_inter = 1; max_transform_hierarchy_depth_intra = 1; scaling_list_enable_flag = 0; sps_scaling_list_data_present_flag = 0; // TODO struct scaling_list_data scaling_list; amp_enabled_flag = 0; sample_adaptive_offset_enabled_flag = 0; pcm_enabled_flag = 0; pcm_sample_bit_depth_luma = 8; pcm_sample_bit_depth_chroma = 8; // TODO log2_min_pcm_luma_coding_block_size; // TODO log2_diff_max_min_pcm_luma_coding_block_size; pcm_loop_filter_disable_flag = 1; // num_short_term_ref_pic_sets = 0; // std::vector ref_pic_sets; // [0 ; num_short_term_ref_pic_set (<=MAX_REF_PIC_SETS) ) ref_pic_sets.clear(); long_term_ref_pics_present_flag = 0; num_long_term_ref_pics_sps = 0; /* TODO int lt_ref_pic_poc_lsb_sps[MAX_NUM_LT_REF_PICS_SPS]; char used_by_curr_pic_lt_sps_flag[MAX_NUM_LT_REF_PICS_SPS]; */ sps_temporal_mvp_enabled_flag = 0; strong_intra_smoothing_enable_flag = 0; vui_parameters_present_flag = 0; /* if( vui_parameters_present_flag ) vui_parameters() */ sps_extension_present_flag = 0; 
sps_range_extension_flag = 0; sps_multilayer_extension_flag = 0; sps_extension_6bits = 0; } void seq_parameter_set::set_CB_log2size_range(int mini,int maxi) { log2_min_luma_coding_block_size = mini; log2_diff_max_min_luma_coding_block_size = maxi-mini; } void seq_parameter_set::set_TB_log2size_range(int mini,int maxi) { log2_min_transform_block_size = mini; log2_diff_max_min_transform_block_size = maxi-mini; } void seq_parameter_set::set_resolution(int w,int h) { pic_width_in_luma_samples = w; pic_height_in_luma_samples = h; } de265_error seq_parameter_set::read(error_queue* errqueue, bitreader* br) { uint32_t vlc; video_parameter_set_id = br->get_bits(4); sps_max_sub_layers = br->get_bits(3) +1; if (sps_max_sub_layers>7) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } sps_temporal_id_nesting_flag = br->get_bits(1); profile_tier_level_.read(br, sps_max_sub_layers); if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc >= DE265_MAX_SPS_SETS) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } seq_parameter_set_id = vlc; // --- decode chroma type --- if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc > 3) { errqueue->add_warning(DE265_WARNING_INVALID_CHROMA_FORMAT, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } chroma_format_idc = vlc; if (chroma_format_idc == 3) { separate_colour_plane_flag = br->get_bits(1); } else { separate_colour_plane_flag = 0; } // --- picture size --- if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc == 0 || vlc > MAX_PICTURE_WIDTH) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } pic_width_in_luma_samples = vlc; if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc == 0 || vlc > MAX_PICTURE_HEIGHT) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } pic_height_in_luma_samples = vlc; conformance_window_flag = br->get_bits(1); if (conformance_window_flag) { if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc >= static_cast(pic_width_in_luma_samples)) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return 
DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } conf_win_left_offset = vlc; if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc + conf_win_left_offset >= static_cast(pic_width_in_luma_samples)) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } conf_win_right_offset = vlc; if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc >= static_cast(pic_height_in_luma_samples)) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } conf_win_top_offset = vlc; if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc + conf_win_top_offset >= static_cast(pic_height_in_luma_samples)) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } conf_win_bottom_offset = vlc; } else { conf_win_left_offset = 0; conf_win_right_offset = 0; conf_win_top_offset = 0; conf_win_bottom_offset= 0; } if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc > 8) { errqueue->add_warning(DE265_WARNING_SPS_HEADER_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } bit_depth_luma = vlc + 8; if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc > 8) { errqueue->add_warning(DE265_WARNING_SPS_HEADER_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } bit_depth_chroma = vlc + 8; if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc > 12) { errqueue->add_warning(DE265_WARNING_SPS_HEADER_INVALID, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } log2_max_pic_order_cnt_lsb = vlc + 4; MaxPicOrderCntLsb = 1<<(log2_max_pic_order_cnt_lsb); // --- sub_layer_ordering_info --- sps_sub_layer_ordering_info_present_flag = br->get_bits(1); int firstLayer = (sps_sub_layer_ordering_info_present_flag ? 
0 : sps_max_sub_layers-1 ); for (int i=firstLayer ; i <= sps_max_sub_layers-1; i++ ) { // sps_max_dec_pic_buffering[i] vlc=br->get_uvlc(); if (vlc == UVLC_ERROR || vlc+1 > MAX_NUM_REF_PICS) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } sps_max_dec_pic_buffering[i] = vlc+1; // sps_max_num_reorder_pics[i] if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc > sps_max_dec_pic_buffering[i]) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } sps_max_num_reorder_pics[i] = vlc; // sps_max_latency_increase[i] READ_VLC(sps_max_latency_increase_plus1[i], uvlc); sps_max_latency_increase_present[i] = (sps_max_latency_increase_plus1[i] != 0); if (sps_max_latency_increase_present[i]) { SpsMaxLatencyPictures[i] = (sps_max_num_reorder_pics[i] + sps_max_latency_increase_plus1[i] - 1); } } // copy info to all layers if only specified once if (sps_sub_layer_ordering_info_present_flag) { int ref = sps_max_sub_layers-1; assert(ref<7); for (int i=0 ; i < sps_max_sub_layers-1; i++ ) { sps_max_dec_pic_buffering[i] = sps_max_dec_pic_buffering[ref]; sps_max_num_reorder_pics[i] = sps_max_num_reorder_pics[ref]; sps_max_latency_increase_plus1[i] = sps_max_latency_increase_plus1[ref]; sps_max_latency_increase_present[i] = sps_max_latency_increase_present[ref]; SpsMaxLatencyPictures[i] = SpsMaxLatencyPictures[ref]; } } if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc > 3) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } log2_min_luma_coding_block_size = vlc + 3; if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc > static_cast(6 - log2_min_luma_coding_block_size)) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } log2_diff_max_min_luma_coding_block_size = vlc; if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc 
> 3) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } log2_min_transform_block_size = vlc + 2; if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc > static_cast(5 - log2_min_transform_block_size)) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } log2_diff_max_min_transform_block_size = vlc; // log2_min_transform_block_size must not exceed the max coding block size (Log2CtbSizeY) if (log2_min_transform_block_size > log2_min_luma_coding_block_size + log2_diff_max_min_luma_coding_block_size) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } uint32_t maxDepth = log2_min_luma_coding_block_size + log2_diff_max_min_luma_coding_block_size - log2_min_transform_block_size; if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc > maxDepth) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } max_transform_hierarchy_depth_inter = vlc; if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc > maxDepth) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } max_transform_hierarchy_depth_intra = vlc; scaling_list_enable_flag = br->get_bits(1); if (scaling_list_enable_flag) { sps_scaling_list_data_present_flag = br->get_bits(1); if (sps_scaling_list_data_present_flag) { de265_error err; if ((err=read_scaling_list(br,this, &scaling_list, false)) != DE265_OK) { return err; } } else { set_default_scaling_lists(&scaling_list); } } amp_enabled_flag = br->get_bits(1); sample_adaptive_offset_enabled_flag = br->get_bits(1); pcm_enabled_flag = br->get_bits(1); if (pcm_enabled_flag) { pcm_sample_bit_depth_luma = br->get_bits(4)+1; pcm_sample_bit_depth_chroma = br->get_bits(4)+1; int log2PcmCbSizeMax = 
std::min(static_cast(log2_min_luma_coding_block_size + log2_diff_max_min_luma_coding_block_size), 5); if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc + 3 > static_cast(log2PcmCbSizeMax)) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } log2_min_pcm_luma_coding_block_size = vlc + 3; if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc > static_cast(log2PcmCbSizeMax - log2_min_pcm_luma_coding_block_size)) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } log2_diff_max_min_pcm_luma_coding_block_size = vlc; pcm_loop_filter_disable_flag = br->get_bits(1); if (pcm_sample_bit_depth_luma > bit_depth_luma) { errqueue->add_warning(DE265_WARNING_PCM_BITDEPTH_TOO_LARGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } if (pcm_sample_bit_depth_chroma > bit_depth_chroma) { errqueue->add_warning(DE265_WARNING_PCM_BITDEPTH_TOO_LARGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } } else { pcm_sample_bit_depth_luma = 0; pcm_sample_bit_depth_chroma = 0; log2_min_pcm_luma_coding_block_size = 0; log2_diff_max_min_pcm_luma_coding_block_size = 0; pcm_loop_filter_disable_flag = 0; } int num_short_term_ref_pic_sets; READ_VLC(num_short_term_ref_pic_sets, uvlc); if (num_short_term_ref_pic_sets < 0 || num_short_term_ref_pic_sets > 64) { errqueue->add_warning(DE265_WARNING_NUMBER_OF_SHORT_TERM_REF_PIC_SETS_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } // --- allocate reference pic set --- // we do not allocate the ref-pic-set for the slice header here, but in the slice header itself ref_pic_sets.resize(num_short_term_ref_pic_sets); for (int i = 0; i < num_short_term_ref_pic_sets; i++) { bool success = read_short_term_ref_pic_set(errqueue,this,br, &ref_pic_sets[i], i, ref_pic_sets, false); if (!success) { return DE265_WARNING_SPS_HEADER_INVALID; } // dump_short_term_ref_pic_set(&(*ref_pic_sets)[i], 
fh); } long_term_ref_pics_present_flag = br->get_bits(1); if (long_term_ref_pics_present_flag) { if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc > MAX_NUM_LT_REF_PICS_SPS) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } num_long_term_ref_pics_sps = vlc; for (int i = 0; i < num_long_term_ref_pics_sps; i++ ) { lt_ref_pic_poc_lsb_sps[i] = br->get_bits(log2_max_pic_order_cnt_lsb); used_by_curr_pic_lt_sps_flag[i] = br->get_bits(1); } } else { num_long_term_ref_pics_sps = 0; // NOTE: missing definition in standard ! } sps_temporal_mvp_enabled_flag = br->get_bits(1); strong_intra_smoothing_enable_flag = br->get_bits(1); vui_parameters_present_flag = br->get_bits(1); if (vui_parameters_present_flag) { de265_error err = vui.read(errqueue, br, this); if (err) { return err; } } sps_extension_present_flag = br->get_bits(1); if (sps_extension_present_flag) { sps_range_extension_flag = br->get_bits(1); sps_multilayer_extension_flag = br->get_bits(1); sps_extension_6bits = br->get_bits(6); } else { sps_range_extension_flag = 0; } if (sps_range_extension_flag) { de265_error err = range_extension.read(errqueue, br); if (err != DE265_OK) { return err; } } /* sps_extension_flag = br->get_bits(1); if (sps_extension_flag) { assert(false); } */ de265_error err = compute_derived_values(); if (err != DE265_OK) { return err; } sps_read = true; return DE265_OK; } de265_error seq_parameter_set::compute_derived_values(bool sanitize_values) { // --- compute derived values --- SubWidthC = SubWidthC_tab [chroma_format_idc]; SubHeightC = SubHeightC_tab[chroma_format_idc]; if (separate_colour_plane_flag) { ChromaArrayType = 0; } else { ChromaArrayType = chroma_format_idc; } if (ChromaArrayType==0) { WinUnitX = 1; WinUnitY = 1; } else { WinUnitX = SubWidthC_tab [chroma_format_idc]; WinUnitY = SubHeightC_tab[chroma_format_idc]; } BitDepth_Y = bit_depth_luma; QpBdOffset_Y = 6*(bit_depth_luma-8); BitDepth_C = bit_depth_chroma; QpBdOffset_C = 6*(bit_depth_chroma-8); Log2MinCbSizeY = 
log2_min_luma_coding_block_size; Log2CtbSizeY = Log2MinCbSizeY + log2_diff_max_min_luma_coding_block_size; MinCbSizeY = 1 << Log2MinCbSizeY; CtbSizeY = 1 << Log2CtbSizeY; PicWidthInMinCbsY = ceil_div(pic_width_in_luma_samples, MinCbSizeY); PicWidthInCtbsY = ceil_div(pic_width_in_luma_samples, CtbSizeY); PicHeightInMinCbsY = ceil_div(pic_height_in_luma_samples, MinCbSizeY); PicHeightInCtbsY = ceil_div(pic_height_in_luma_samples,CtbSizeY); PicSizeInMinCbsY = PicWidthInMinCbsY * PicHeightInMinCbsY; PicSizeInCtbsY = PicWidthInCtbsY * PicHeightInCtbsY; PicSizeInSamplesY = pic_width_in_luma_samples * pic_height_in_luma_samples; if (chroma_format_idc==0 || separate_colour_plane_flag) { CtbWidthC = 0; CtbHeightC = 0; } else { CtbWidthC = CtbSizeY / SubWidthC; CtbHeightC = CtbSizeY / SubHeightC; } Log2MinTrafoSize = log2_min_transform_block_size; Log2MaxTrafoSize = log2_min_transform_block_size + log2_diff_max_min_transform_block_size; if (max_transform_hierarchy_depth_inter > Log2CtbSizeY - Log2MinTrafoSize) { if (sanitize_values) { max_transform_hierarchy_depth_inter = Log2CtbSizeY - Log2MinTrafoSize; } else { if (D) fprintf(stderr,"SPS error: transform hierarchy depth (inter) > CTB size - min TB size\n"); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } } if (max_transform_hierarchy_depth_intra > Log2CtbSizeY - Log2MinTrafoSize) { if (sanitize_values) { max_transform_hierarchy_depth_intra = Log2CtbSizeY - Log2MinTrafoSize; } else { if (D) fprintf(stderr,"SPS error: transform hierarchy depth (intra) > CTB size - min TB size\n"); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } } if (sanitize_values) { if (max_transform_hierarchy_depth_inter < Log2CtbSizeY - Log2MaxTrafoSize) { max_transform_hierarchy_depth_inter = Log2CtbSizeY - Log2MaxTrafoSize; } if (max_transform_hierarchy_depth_intra < Log2CtbSizeY - Log2MaxTrafoSize) { max_transform_hierarchy_depth_intra = Log2CtbSizeY - Log2MaxTrafoSize; } } Log2MinPUSize = Log2MinCbSizeY-1; PicWidthInMinPUs = PicWidthInCtbsY << 
(Log2CtbSizeY - Log2MinPUSize); PicHeightInMinPUs = PicHeightInCtbsY << (Log2CtbSizeY - Log2MinPUSize); Log2MinIpcmCbSizeY = log2_min_pcm_luma_coding_block_size; Log2MaxIpcmCbSizeY = (log2_min_pcm_luma_coding_block_size + log2_diff_max_min_pcm_luma_coding_block_size); // the following are not in the standard PicWidthInTbsY = PicWidthInCtbsY << (Log2CtbSizeY - Log2MinTrafoSize); PicHeightInTbsY = PicHeightInCtbsY << (Log2CtbSizeY - Log2MinTrafoSize); PicSizeInTbsY = PicWidthInTbsY * PicHeightInTbsY; if (range_extension.high_precision_offsets_enabled_flag) { WpOffsetBdShiftY = 0; WpOffsetBdShiftC = 0; WpOffsetHalfRangeY = 1 << (BitDepth_Y - 1); WpOffsetHalfRangeC = 1 << (BitDepth_C - 1); } else { WpOffsetBdShiftY = ( BitDepth_Y - 8 ); WpOffsetBdShiftC = ( BitDepth_C - 8 ); WpOffsetHalfRangeY = 1 << 7; WpOffsetHalfRangeC = 1 << 7; } // --- check SPS sanity --- if (pic_width_in_luma_samples % MinCbSizeY != 0 || pic_height_in_luma_samples % MinCbSizeY != 0) { // TODO: warn that image size is coded wrong in bitstream (must be multiple of MinCbSizeY) if (D) fprintf(stderr,"SPS error: CB alignment\n"); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } if (Log2MinTrafoSize > Log2MinCbSizeY) { if (D) fprintf(stderr,"SPS error: TB > CB\n"); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } if (Log2MaxTrafoSize > libde265_min(Log2CtbSizeY,5)) { if (D) fprintf(stderr,"SPS error: TB_max > 32 or CTB\n"); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } if (BitDepth_Y < 8 || BitDepth_Y > 16) { if (D) fprintf(stderr,"SPS error: bitdepth Y not in [8;16]\n"); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } if (BitDepth_C < 8 || BitDepth_C > 16) { if (D) fprintf(stderr,"SPS error: bitdepth C not in [8;16]\n"); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } sps_read = true; return DE265_OK; } void seq_parameter_set::dump(int fd) const { //#if (_MSC_VER >= 1500) //#define LOG0(t) loginfo(LogHeaders, t) //#define LOG1(t,d) loginfo(LogHeaders, t,d) //#define LOG2(t,d1,d2) 
loginfo(LogHeaders, t,d1,d2) //#define LOG3(t,d1,d2,d3) loginfo(LogHeaders, t,d1,d2,d3) FILE* fh; if (fd==1) fh=stdout; else if (fd==2) fh=stderr; else { return; } #define LOG0(t) log2fh(fh, t) #define LOG1(t,d) log2fh(fh, t,d) #define LOG2(t,d1,d2) log2fh(fh, t,d1,d2) #define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3) LOG0("----------------- SPS -----------------\n"); LOG1("video_parameter_set_id : %d\n", video_parameter_set_id); LOG1("sps_max_sub_layers : %d\n", sps_max_sub_layers); LOG1("sps_temporal_id_nesting_flag : %d\n", sps_temporal_id_nesting_flag); profile_tier_level_.dump(sps_max_sub_layers, fh); LOG1("seq_parameter_set_id : %d\n", seq_parameter_set_id); LOG2("chroma_format_idc : %d (%s)\n", chroma_format_idc, chroma_format_idc == 0 ? "monochrome" : chroma_format_idc == 1 ? "4:2:0" : chroma_format_idc == 2 ? "4:2:2" : chroma_format_idc == 3 ? "4:4:4" : "unknown"); if (chroma_format_idc == 3) { LOG1("separate_colour_plane_flag : %d\n", separate_colour_plane_flag); } LOG1("pic_width_in_luma_samples : %d\n", pic_width_in_luma_samples); LOG1("pic_height_in_luma_samples : %d\n", pic_height_in_luma_samples); LOG1("conformance_window_flag : %d\n", conformance_window_flag); if (conformance_window_flag) { LOG1("conf_win_left_offset : %d\n", conf_win_left_offset); LOG1("conf_win_right_offset : %d\n", conf_win_right_offset); LOG1("conf_win_top_offset : %d\n", conf_win_top_offset); LOG1("conf_win_bottom_offset: %d\n", conf_win_bottom_offset); } LOG1("bit_depth_luma : %d\n", bit_depth_luma); LOG1("bit_depth_chroma : %d\n", bit_depth_chroma); LOG1("log2_max_pic_order_cnt_lsb : %d\n", log2_max_pic_order_cnt_lsb); LOG1("sps_sub_layer_ordering_info_present_flag : %d\n", sps_sub_layer_ordering_info_present_flag); int firstLayer = (sps_sub_layer_ordering_info_present_flag ? 
0 : sps_max_sub_layers-1 ); for (int i=firstLayer ; i <= sps_max_sub_layers-1; i++ ) { LOG1("Layer %d\n",i); LOG1(" sps_max_dec_pic_buffering : %d\n", sps_max_dec_pic_buffering[i]); LOG1(" sps_max_num_reorder_pics : %d\n", sps_max_num_reorder_pics[i]); LOG1(" sps_max_latency_increase_plus1 : %d\n", sps_max_latency_increase_plus1[i]); } LOG1("log2_min_luma_coding_block_size : %d\n", log2_min_luma_coding_block_size); LOG1("log2_diff_max_min_luma_coding_block_size : %d\n",log2_diff_max_min_luma_coding_block_size); LOG1("log2_min_transform_block_size : %d\n", log2_min_transform_block_size); LOG1("log2_diff_max_min_transform_block_size : %d\n", log2_diff_max_min_transform_block_size); LOG1("max_transform_hierarchy_depth_inter : %d\n", max_transform_hierarchy_depth_inter); LOG1("max_transform_hierarchy_depth_intra : %d\n", max_transform_hierarchy_depth_intra); LOG1("scaling_list_enable_flag : %d\n", scaling_list_enable_flag); if (scaling_list_enable_flag) { LOG1("sps_scaling_list_data_present_flag : %d\n", sps_scaling_list_data_present_flag); if (sps_scaling_list_data_present_flag) { LOG0("scaling list logging output not implemented"); //assert(0); //scaling_list_data() } } LOG1("amp_enabled_flag : %d\n", amp_enabled_flag); LOG1("sample_adaptive_offset_enabled_flag : %d\n", sample_adaptive_offset_enabled_flag); LOG1("pcm_enabled_flag : %d\n", pcm_enabled_flag); if (pcm_enabled_flag) { LOG1("pcm_sample_bit_depth_luma : %d\n", pcm_sample_bit_depth_luma); LOG1("pcm_sample_bit_depth_chroma : %d\n", pcm_sample_bit_depth_chroma); LOG1("log2_min_pcm_luma_coding_block_size : %d\n", log2_min_pcm_luma_coding_block_size); LOG1("log2_diff_max_min_pcm_luma_coding_block_size : %d\n", log2_diff_max_min_pcm_luma_coding_block_size); LOG1("pcm_loop_filter_disable_flag : %d\n", pcm_loop_filter_disable_flag); } LOG1("num_short_term_ref_pic_sets : %d\n", ref_pic_sets.size()); for (size_t i = 0; i < ref_pic_sets.size(); i++) { LOG1("ref_pic_set[ %2d ]: ",i); 
dump_compact_short_term_ref_pic_set(&ref_pic_sets[i], 16, fh); } LOG1("long_term_ref_pics_present_flag : %d\n", long_term_ref_pics_present_flag); if (long_term_ref_pics_present_flag) { LOG1("num_long_term_ref_pics_sps : %d\n", num_long_term_ref_pics_sps); for (int i = 0; i < num_long_term_ref_pics_sps; i++ ) { LOG3("lt_ref_pic_poc_lsb_sps[%d] : %d (used_by_curr_pic_lt_sps_flag=%d)\n", i, lt_ref_pic_poc_lsb_sps[i], used_by_curr_pic_lt_sps_flag[i]); } } LOG1("sps_temporal_mvp_enabled_flag : %d\n", sps_temporal_mvp_enabled_flag); LOG1("strong_intra_smoothing_enable_flag : %d\n", strong_intra_smoothing_enable_flag); LOG1("vui_parameters_present_flag : %d\n", vui_parameters_present_flag); LOG1("sps_extension_present_flag : %d\n", sps_extension_present_flag); LOG1("sps_range_extension_flag : %d\n", sps_range_extension_flag); LOG1("sps_multilayer_extension_flag : %d\n", sps_multilayer_extension_flag); LOG1("sps_extension_6bits : %d\n", sps_extension_6bits); LOG1("CtbSizeY : %d\n", CtbSizeY); LOG1("MinCbSizeY : %d\n", MinCbSizeY); LOG1("MaxCbSizeY : %d\n", 1<<(log2_min_luma_coding_block_size + log2_diff_max_min_luma_coding_block_size)); LOG1("MinTBSizeY : %d\n", 1<get_bits(1); if (!scaling_list_pred_mode_flag) { uint32_t scaling_list_pred_matrix_id_delta = br->get_uvlc(); if (scaling_list_pred_matrix_id_delta == UVLC_ERROR) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } if (sizeId == 3) { // adapt to our changed matrixId for size 3 scaling_list_pred_matrix_id_delta *= 3; } if (scaling_list_pred_matrix_id_delta > (uint32_t)matrixId) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } //printf("scaling_list_pred_matrix_id_delta=%d\n", scaling_list_pred_matrix_id_delta); dc_coeff[sizeId][matrixId] = 16; scaling_list_dc_coef = 16; if (scaling_list_pred_matrix_id_delta==0) { if (sizeId==0) { memcpy(curr_scaling_list, default_ScalingList_4x4, 16); } else { if (matrixId<3) { memcpy(curr_scaling_list, default_ScalingList_8x8_intra,64); } else { memcpy(curr_scaling_list, 
default_ScalingList_8x8_inter,64); } } } else { if (sizeId==3) { assert(scaling_list_pred_matrix_id_delta==3); } int mID = matrixId - scaling_list_pred_matrix_id_delta; int len = (sizeId == 0 ? 16 : 64); memcpy(curr_scaling_list, scaling_list[mID], len); scaling_list_dc_coef = dc_coeff[sizeId][mID]; dc_coeff[sizeId][matrixId] = dc_coeff[sizeId][mID]; } } else { int nextCoef=8; int coefNum = (sizeId==0 ? 16 : 64); if (sizeId>1) { scaling_list_dc_coef = br->get_svlc(); if (scaling_list_dc_coef < -7 || scaling_list_dc_coef > 247) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } scaling_list_dc_coef += 8; nextCoef=scaling_list_dc_coef; dc_coeff[sizeId][matrixId] = scaling_list_dc_coef; } else { scaling_list_dc_coef = 16; } //printf("DC = %d\n",scaling_list_dc_coef); for (int i=0;iget_svlc(); if (scaling_list_delta_coef < -128 || scaling_list_delta_coef > 127) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } nextCoef = (nextCoef + scaling_list_delta_coef + 256) % 256; curr_scaling_list[i] = nextCoef; //printf("curr %d = %d\n",i,nextCoef); } } // --- generate ScalingFactor arrays --- switch (sizeId) { case 0: fill_scaling_factor(&sclist->ScalingFactor_Size0[matrixId][0][0], curr_scaling_list, 0); break; case 1: fill_scaling_factor(&sclist->ScalingFactor_Size1[matrixId][0][0], curr_scaling_list, 1); break; case 2: fill_scaling_factor(&sclist->ScalingFactor_Size2[matrixId][0][0], curr_scaling_list, 2); sclist->ScalingFactor_Size2[matrixId][0][0] = scaling_list_dc_coef; //printf("DC coeff: %d\n", scaling_list_dc_coef); break; case 3: fill_scaling_factor(&sclist->ScalingFactor_Size3[matrixId][0][0], curr_scaling_list, 3); sclist->ScalingFactor_Size3[matrixId][0][0] = scaling_list_dc_coef; //printf("DC coeff: %d\n", scaling_list_dc_coef); break; } } } // --- fill 32x32 matrices for chroma const position* scan = get_scan_order(3, 0 /* diag */); for (int matrixId=0;matrixId<6;matrixId++) if (matrixId!=0 && matrixId!=3) { for (int i=0;i<64;i++) { int x = scan[i].x; int y 
= scan[i].y; int v = sclist->ScalingFactor_Size1[matrixId][y][x]; for (int dy=0;dy<4;dy++) for (int dx=0;dx<4;dx++) { sclist->ScalingFactor_Size3[matrixId][4*y+dy][4*x+dx] = v; } } sclist->ScalingFactor_Size3[matrixId][0][0] = sclist->ScalingFactor_Size1[matrixId][0][0]; } return DE265_OK; } de265_error write_scaling_list(CABAC_encoder& out, const seq_parameter_set* sps, scaling_list_data* sclist, bool inPPS) { assert(false); // TODO return DE265_OK; } void set_default_scaling_lists(scaling_list_data* sclist) { // 4x4 for (int matrixId=0;matrixId<6;matrixId++) { fill_scaling_factor(&sclist->ScalingFactor_Size0[matrixId][0][0], default_ScalingList_4x4, 0); } // 8x8 for (int matrixId=0;matrixId<3;matrixId++) { fill_scaling_factor(&sclist->ScalingFactor_Size1[matrixId+0][0][0], default_ScalingList_8x8_intra, 1); fill_scaling_factor(&sclist->ScalingFactor_Size1[matrixId+3][0][0], default_ScalingList_8x8_inter, 1); } // 16x16 for (int matrixId=0;matrixId<3;matrixId++) { fill_scaling_factor(&sclist->ScalingFactor_Size2[matrixId+0][0][0], default_ScalingList_8x8_intra, 2); fill_scaling_factor(&sclist->ScalingFactor_Size2[matrixId+3][0][0], default_ScalingList_8x8_inter, 2); } // 32x32 fill_scaling_factor(&sclist->ScalingFactor_Size3[0][0][0], default_ScalingList_8x8_intra, 3); fill_scaling_factor(&sclist->ScalingFactor_Size3[1][0][0], default_ScalingList_8x8_inter, 3); } de265_error seq_parameter_set::write(error_queue* errqueue, CABAC_encoder& out) { out.write_bits(video_parameter_set_id, 4); if (sps_max_sub_layers>7) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } out.write_bits(sps_max_sub_layers-1, 3); out.write_bit(sps_temporal_id_nesting_flag); profile_tier_level_.write(out, sps_max_sub_layers); out.write_uvlc(seq_parameter_set_id); // --- encode chroma type --- out.write_uvlc(chroma_format_idc); if (chroma_format_idc>3) { errqueue->add_warning(DE265_WARNING_INVALID_CHROMA_FORMAT, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } if (chroma_format_idc 
== 3) { out.write_bit(separate_colour_plane_flag); } // --- picture size --- out.write_uvlc(pic_width_in_luma_samples); out.write_uvlc(pic_height_in_luma_samples); out.write_bit(conformance_window_flag); if (conformance_window_flag) { out.write_uvlc(conf_win_left_offset); out.write_uvlc(conf_win_right_offset); out.write_uvlc(conf_win_top_offset); out.write_uvlc(conf_win_bottom_offset); } out.write_uvlc(bit_depth_luma-8); out.write_uvlc(bit_depth_chroma-8); out.write_uvlc(log2_max_pic_order_cnt_lsb-4); // --- sub_layer_ordering_info --- out.write_bit(sps_sub_layer_ordering_info_present_flag); int firstLayer = (sps_sub_layer_ordering_info_present_flag ? 0 : sps_max_sub_layers-1 ); for (int i=firstLayer ; i <= sps_max_sub_layers-1; i++ ) { // sps_max_dec_pic_buffering[i] if (sps_max_dec_pic_buffering[i] > MAX_NUM_REF_PICS) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } out.write_uvlc(sps_max_dec_pic_buffering[i]-1); // sps_max_num_reorder_pics[i] out.write_uvlc(sps_max_num_reorder_pics[i]); // sps_max_latency_increase[i] out.write_uvlc(sps_max_latency_increase_plus1[i]); } out.write_uvlc(log2_min_luma_coding_block_size-3); out.write_uvlc(log2_diff_max_min_luma_coding_block_size); out.write_uvlc(log2_min_transform_block_size-2); out.write_uvlc(log2_diff_max_min_transform_block_size); out.write_uvlc(max_transform_hierarchy_depth_inter); out.write_uvlc(max_transform_hierarchy_depth_intra); out.write_bit(scaling_list_enable_flag); if (scaling_list_enable_flag) { out.write_bit(sps_scaling_list_data_present_flag); if (sps_scaling_list_data_present_flag) { de265_error err; if ((err=write_scaling_list(out,this, &scaling_list, false)) != DE265_OK) { return err; } } } out.write_bit(amp_enabled_flag); out.write_bit(sample_adaptive_offset_enabled_flag); out.write_bit(pcm_enabled_flag); if (pcm_enabled_flag) { out.write_bits(pcm_sample_bit_depth_luma -1,4); 
out.write_bits(pcm_sample_bit_depth_chroma-1,4); out.write_uvlc(log2_min_pcm_luma_coding_block_size-3); out.write_uvlc(log2_diff_max_min_pcm_luma_coding_block_size); out.write_bit(pcm_loop_filter_disable_flag); } int num_short_term_ref_pic_sets = ref_pic_sets.size(); if (num_short_term_ref_pic_sets < 0 || num_short_term_ref_pic_sets > 64) { errqueue->add_warning(DE265_WARNING_NUMBER_OF_SHORT_TERM_REF_PIC_SETS_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } out.write_uvlc(num_short_term_ref_pic_sets); // --- allocate reference pic set --- // we do not allocate the ref-pic-set for the slice header here, but in the slice header itself for (int i = 0; i < num_short_term_ref_pic_sets; i++) { bool success = write_short_term_ref_pic_set(errqueue,this,out, &ref_pic_sets[i], i, ref_pic_sets, false); if (!success) { return DE265_WARNING_SPS_HEADER_INVALID; } // dump_short_term_ref_pic_set(&(*ref_pic_sets)[i], fh); } out.write_bit(long_term_ref_pics_present_flag); if (long_term_ref_pics_present_flag) { if (num_long_term_ref_pics_sps > MAX_NUM_LT_REF_PICS_SPS) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } out.write_uvlc(num_long_term_ref_pics_sps); for (int i = 0; i < num_long_term_ref_pics_sps; i++ ) { out.write_bits(lt_ref_pic_poc_lsb_sps[i], log2_max_pic_order_cnt_lsb); out.write_bit (used_by_curr_pic_lt_sps_flag[i]); } } out.write_bit(sps_temporal_mvp_enabled_flag); out.write_bit(strong_intra_smoothing_enable_flag); out.write_bit(vui_parameters_present_flag); #if 0 if (vui_parameters_present_flag) { assert(false); /* vui_parameters() sps_extension_flag u(1) if( sps_extension_flag ) while( more_rbsp_data() ) sps_extension_data_flag u(1) rbsp_trailing_bits() */ } #endif out.write_bit(sps_extension_present_flag); #if 0 if (sps_extension_flag) { assert(false); } br->check_rbsp_trailing_bits(); #endif // --- compute derived values --- #if 0 BitDepth_Y = bit_depth_luma; QpBdOffset_Y = 6*(bit_depth_luma-8); BitDepth_C = bit_depth_chroma; 
QpBdOffset_C = 6*(bit_depth_chroma-8); Log2MinCbSizeY = log2_min_luma_coding_block_size; Log2CtbSizeY = Log2MinCbSizeY + log2_diff_max_min_luma_coding_block_size; MinCbSizeY = 1 << Log2MinCbSizeY; CtbSizeY = 1 << Log2CtbSizeY; PicWidthInMinCbsY = pic_width_in_luma_samples / MinCbSizeY; PicWidthInCtbsY = ceil_div(pic_width_in_luma_samples, CtbSizeY); PicHeightInMinCbsY = pic_height_in_luma_samples / MinCbSizeY; PicHeightInCtbsY = ceil_div(pic_height_in_luma_samples,CtbSizeY); PicSizeInMinCbsY = PicWidthInMinCbsY * PicHeightInMinCbsY; PicSizeInCtbsY = PicWidthInCtbsY * PicHeightInCtbsY; PicSizeInSamplesY = pic_width_in_luma_samples * pic_height_in_luma_samples; if (chroma_format_idc==0 || separate_colour_plane_flag) { CtbWidthC = 0; CtbHeightC = 0; } else { CtbWidthC = CtbSizeY / SubWidthC; CtbHeightC = CtbSizeY / SubHeightC; } Log2MinTrafoSize = log2_min_transform_block_size; Log2MaxTrafoSize = log2_min_transform_block_size + log2_diff_max_min_transform_block_size; Log2MinPUSize = Log2MinCbSizeY-1; PicWidthInMinPUs = PicWidthInCtbsY << (Log2CtbSizeY - Log2MinPUSize); PicHeightInMinPUs = PicHeightInCtbsY << (Log2CtbSizeY - Log2MinPUSize); Log2MinIpcmCbSizeY = log2_min_pcm_luma_coding_block_size; Log2MaxIpcmCbSizeY = (log2_min_pcm_luma_coding_block_size + log2_diff_max_min_pcm_luma_coding_block_size); // the following are not in the standard PicWidthInTbsY = PicWidthInCtbsY << (Log2CtbSizeY - Log2MinTrafoSize); PicHeightInTbsY = PicHeightInCtbsY << (Log2CtbSizeY - Log2MinTrafoSize); PicSizeInTbsY = PicWidthInTbsY * PicHeightInTbsY; sps_read = true; #endif return DE265_OK; } de265_error sps_range_extension::read(error_queue* errqueue, bitreader* br) { transform_skip_rotation_enabled_flag = br->get_bits(1); transform_skip_context_enabled_flag = br->get_bits(1); implicit_rdpcm_enabled_flag = br->get_bits(1); explicit_rdpcm_enabled_flag = br->get_bits(1); extended_precision_processing_flag = br->get_bits(1); intra_smoothing_disabled_flag = br->get_bits(1); 
high_precision_offsets_enabled_flag = br->get_bits(1); persistent_rice_adaptation_enabled_flag = br->get_bits(1); cabac_bypass_alignment_enabled_flag = br->get_bits(1); return DE265_OK; } #define LOG0(t) log2fh(fh, t) #define LOG1(t,d) log2fh(fh, t,d) void sps_range_extension::dump(int fd) const { FILE* fh; if (fd==1) fh=stdout; else if (fd==2) fh=stderr; else { return; } LOG0("----------------- SPS-range-extension -----------------\n"); LOG1("transform_skip_rotation_enabled_flag : %d\n", transform_skip_rotation_enabled_flag); LOG1("transform_skip_context_enabled_flag : %d\n", transform_skip_context_enabled_flag); LOG1("implicit_rdpcm_enabled_flag : %d\n", implicit_rdpcm_enabled_flag); LOG1("explicit_rdpcm_enabled_flag : %d\n", explicit_rdpcm_enabled_flag); LOG1("extended_precision_processing_flag : %d\n", extended_precision_processing_flag); LOG1("intra_smoothing_disabled_flag : %d\n", intra_smoothing_disabled_flag); LOG1("high_precision_offsets_enabled_flag : %d\n", high_precision_offsets_enabled_flag); LOG1("persistent_rice_adaptation_enabled_flag : %d\n", persistent_rice_adaptation_enabled_flag); LOG1("cabac_bypass_alignment_enabled_flag : %d\n", cabac_bypass_alignment_enabled_flag); } #undef LOG1 #undef LOG0 libde265-1.0.18/libde265/sps.h000066400000000000000000000163271515675107500154340ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef DE265_SPS_H #define DE265_SPS_H #include "libde265/vps.h" #include "libde265/vui.h" #include "libde265/bitstream.h" #include "libde265/refpic.h" #include "libde265/de265.h" #include "libde265/cabac.h" #include class error_queue; // #define MAX_REF_PIC_SETS 64 // maximum according to standard constexpr int MAX_NUM_LT_REF_PICS_SPS = 32; // This is just a safety range. It is chosen such that width/height fits into 16bit integers and the total number of pixels in 32bit integers. constexpr int MAX_PICTURE_WIDTH = 65535; constexpr int MAX_PICTURE_HEIGHT = 65535; enum { CHROMA_MONO = 0, CHROMA_420 = 1, CHROMA_422 = 2, CHROMA_444 = 3, CHROMA_444_SEPARATE }; struct scaling_list_data { // structure size: approx. 4 kB uint8_t ScalingFactor_Size0[6][4][4]; uint8_t ScalingFactor_Size1[6][8][8]; uint8_t ScalingFactor_Size2[6][16][16]; uint8_t ScalingFactor_Size3[6][32][32]; }; enum PresetSet { Preset_Default }; class sps_range_extension { public: sps_range_extension(); uint8_t transform_skip_rotation_enabled_flag = 0; uint8_t transform_skip_context_enabled_flag = 0; uint8_t implicit_rdpcm_enabled_flag = 0; uint8_t explicit_rdpcm_enabled_flag = 0; uint8_t extended_precision_processing_flag = 0; uint8_t intra_smoothing_disabled_flag = 0; uint8_t high_precision_offsets_enabled_flag = 0; uint8_t persistent_rice_adaptation_enabled_flag = 0; uint8_t cabac_bypass_alignment_enabled_flag = 0; de265_error read(error_queue*, bitreader*); void dump(int fd) const; }; class seq_parameter_set { public: seq_parameter_set(); ~seq_parameter_set(); de265_error read(error_queue*, bitreader*); de265_error write(error_queue*, CABAC_encoder&); void dump(int fd) const; void set_defaults(enum PresetSet = Preset_Default); void set_CB_log2size_range(int mini,int maxi); void set_TB_log2size_range(int mini,int maxi); void set_resolution(int w,int h); bool sps_read = false; // whether 
the sps has been read from the bitstream uint8_t video_parameter_set_id; uint8_t sps_max_sub_layers; // [1;7] bool sps_temporal_id_nesting_flag; profile_tier_level profile_tier_level_; uint8_t seq_parameter_set_id; // [0;15] uint8_t chroma_format_idc; // [0;3] bool separate_colour_plane_flag; int pic_width_in_luma_samples; int pic_height_in_luma_samples; bool conformance_window_flag; int conf_win_left_offset; int conf_win_right_offset; int conf_win_top_offset; int conf_win_bottom_offset; uint8_t bit_depth_luma; // [8;16] uint8_t bit_depth_chroma; // [8;16] uint8_t log2_max_pic_order_cnt_lsb; // [4;16] bool sps_sub_layer_ordering_info_present_flag; uint8_t sps_max_dec_pic_buffering[7]; // for each temporal layer uint8_t sps_max_num_reorder_pics[7]; uint32_t sps_max_latency_increase_plus1[7]; bool sps_max_latency_increase_present[7] = {}; uint8_t log2_min_luma_coding_block_size; // smallest CB size [3;6] uint8_t log2_diff_max_min_luma_coding_block_size; // largest CB size uint8_t log2_min_transform_block_size; // smallest TB size [2;5] uint8_t log2_diff_max_min_transform_block_size; // largest TB size uint8_t max_transform_hierarchy_depth_inter; uint8_t max_transform_hierarchy_depth_intra; bool scaling_list_enable_flag; bool sps_scaling_list_data_present_flag; /* if not set, the default scaling lists will be set in scaling_list */ struct scaling_list_data scaling_list; bool amp_enabled_flag; bool sample_adaptive_offset_enabled_flag; bool pcm_enabled_flag; uint8_t pcm_sample_bit_depth_luma; uint8_t pcm_sample_bit_depth_chroma; int log2_min_pcm_luma_coding_block_size; int log2_diff_max_min_pcm_luma_coding_block_size; bool pcm_loop_filter_disable_flag; int num_short_term_ref_pic_sets() const { return ref_pic_sets.size(); } std::vector ref_pic_sets; // [0 ; num_short_term_ref_pic_set (<=MAX_REF_PIC_SETS) ) bool long_term_ref_pics_present_flag; uint8_t num_long_term_ref_pics_sps; // [0;32] int lt_ref_pic_poc_lsb_sps[MAX_NUM_LT_REF_PICS_SPS]; bool 
used_by_curr_pic_lt_sps_flag[MAX_NUM_LT_REF_PICS_SPS]; bool sps_temporal_mvp_enabled_flag; bool strong_intra_smoothing_enable_flag; bool vui_parameters_present_flag; video_usability_information vui; bool sps_extension_present_flag; bool sps_range_extension_flag; bool sps_multilayer_extension_flag; uint8_t sps_extension_6bits; sps_range_extension range_extension; /* if( sps_extension_flag ) while( more_rbsp_data() ) sps_extension_data_flag u(1) rbsp_trailing_bits() */ // --- derived values --- de265_error compute_derived_values(bool sanitize_values = false); int BitDepth_Y; int QpBdOffset_Y; int BitDepth_C; int QpBdOffset_C; int ChromaArrayType; int SubWidthC, SubHeightC; int WinUnitX, WinUnitY; int MaxPicOrderCntLsb; uint8_t Log2MinCbSizeY; uint8_t Log2CtbSizeY; uint8_t MinCbSizeY; uint8_t CtbSizeY; uint16_t PicWidthInMinCbsY; uint16_t PicWidthInCtbsY; uint16_t PicHeightInMinCbsY; uint16_t PicHeightInCtbsY; uint32_t PicSizeInMinCbsY; uint32_t PicSizeInCtbsY; uint32_t PicSizeInSamplesY; int CtbWidthC, CtbHeightC; int PicWidthInTbsY; // not in standard int PicHeightInTbsY; // not in standard int PicSizeInTbsY; // not in standard int Log2MinTrafoSize; int Log2MaxTrafoSize; int Log2MinPUSize; int PicWidthInMinPUs; // might be rounded up int PicHeightInMinPUs; // might be rounded up int Log2MinIpcmCbSizeY; int Log2MaxIpcmCbSizeY; int SpsMaxLatencyPictures[7] = {}; // [temporal layer] uint8_t WpOffsetBdShiftY; uint8_t WpOffsetBdShiftC; uint16_t WpOffsetHalfRangeY; uint16_t WpOffsetHalfRangeC; int getPUIndexRS(int pixelX,int pixelY) const { return (pixelX>>Log2MinPUSize) + (pixelY>>Log2MinPUSize)*PicWidthInMinPUs; } int get_bit_depth(int cIdx) const { if (cIdx==0) return BitDepth_Y; else return BitDepth_C; } int get_chroma_shift_W(int cIdx) const { return cIdx ? SubWidthC -1 : 0; } int get_chroma_shift_H(int cIdx) const { return cIdx ? 
SubHeightC-1 : 0; } }; de265_error read_scaling_list(bitreader*, const seq_parameter_set*, scaling_list_data*, bool inPPS); de265_error write_scaling_list(CABAC_encoder& out, const seq_parameter_set* sps, scaling_list_data* sclist, bool inPPS); void set_default_scaling_lists(scaling_list_data*); #endif libde265-1.0.18/libde265/threads.cc000066400000000000000000000113251515675107500164100ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #include "threads.h" #include #include #if defined(_MSC_VER) || defined(__MINGW32__) # include #elif defined(HAVE_ALLOCA_H) # include #endif de265_progress_lock::de265_progress_lock() { mProgress = 0; } de265_progress_lock::~de265_progress_lock() { } void de265_progress_lock::wait_for_progress(int progress) { if (mProgress >= progress) { return; } std::unique_lock lock(mutex); while (mProgress < progress) { cond.wait(lock); } } void de265_progress_lock::set_progress(int progress) { std::unique_lock lock(mutex); if (progress>mProgress) { mProgress = progress; cond.notify_all(); } } void de265_progress_lock::increase_progress(int progress) { std::unique_lock lock(mutex); mProgress += progress; cond.notify_all(); } int de265_progress_lock::get_progress() const { return mProgress; } #include "libde265/decctx.h" #if 0 const char* line="--------------------------------------------------"; void printblks(const thread_pool* pool) { int w = pool->tasks[0].data.task_ctb.ctx->current_sps->PicWidthInCtbsY; int h = pool->tasks[0].data.task_ctb.ctx->current_sps->PicHeightInCtbsY; printf("active threads: %d queue len: %d\n",pool->num_threads_working,pool->num_tasks); char *const p = (char *)alloca(w * h * sizeof(char)); assert(p != nullptr); memset(p,' ',w*h); for (int i=0;inum_tasks;i++) { int b = 0; //pool->tasks[i].num_blockers; int x = pool->tasks[i].data.task_ctb.ctb_x; int y = pool->tasks[i].data.task_ctb.ctb_y; p[y*w+x] = b+'0'; } for (int i=0;inum_threads_working;i++) { int x = pool->ctbx[i]; int y = pool->ctby[i]; p[y*w+x] = '*'; } printf("+%s+\n",line+50-w); for (int y=0;y lock(pool->mutex); // wait until we can pick a task or until the pool has been stopped for (;;) { // end waiting if thread-pool has been stopped or we have a task to execute if (pool->stopped || pool->tasks.size()>0) { break; } //printf("going idle\n"); pool->cond_var.wait(lock); } // if the pool was shut down, end the execution if (pool->stopped) { return; } // get a task task = 
pool->tasks.front(); pool->tasks.pop_front(); pool->num_threads_working++; //printblks(pool); } // execute the task task->work(); // end processing and check if this was the last task to be processed // TODO: the num_threads_working can probably be an atomic integer std::unique_lock lock(pool->mutex); pool->num_threads_working--; } } de265_error thread_pool::start(int num_threads_to_start) { de265_error err = DE265_OK; // limit number of threads to maximum if (num_threads_to_start > MAX_THREADS) { num_threads_to_start = MAX_THREADS; err = DE265_WARNING_NUMBER_OF_THREADS_LIMITED_TO_MAXIMUM; } num_threads = 0; // will be increased below { std::unique_lock lock(mutex); num_threads_working = 0; stopped = false; } // start worker threads for (int i=0; i lock(mutex); stopped = true; } cond_var.notify_all(); for (int i=0;i lock(mutex); if (!stopped) { tasks.push_back(task); // wake up one thread cond_var.notify_one(); } } libde265-1.0.18/libde265/threads.h000066400000000000000000000050521515675107500162520ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #ifndef DE265_THREADS_H #define DE265_THREADS_H #include "libde265/de265.h" #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #ifdef _WIN32 #if !defined(NOMINMAX) #define NOMINMAX #endif #include #include "../extra/win32cond.h" #if _MSC_VER > 1310 #include #endif #endif // _WIN32 #include #include #include class de265_progress_lock { public: de265_progress_lock(); ~de265_progress_lock(); void wait_for_progress(int progress); void set_progress(int progress); void increase_progress(int progress); int get_progress() const; void reset(int value=0) { mProgress=value; } private: int mProgress; // private data std::mutex mutex; std::condition_variable cond; }; class thread_task { public: thread_task() : state(Queued) { } virtual ~thread_task() { } enum { Queued, Running, Blocked, Finished } state; virtual void work() = 0; virtual std::string name() const { return "noname"; } }; constexpr int MAX_THREADS = 32; /* TODO NOTE: When unblocking a task, we have to check first if there are threads waiting because of the run-count limit. If there are higher-priority tasks, those should be run instead of the just unblocked task. */ class thread_pool { public: de265_error start(int num_threads); void stop(); // do not process remaining tasks void add_task(thread_task* task); bool stopped; std::deque tasks; // we are not the owner int num_threads_working; std::mutex mutex; std::condition_variable cond_var; private: std::thread thread[MAX_THREADS]; int num_threads; //int ctbx[MAX_THREADS]; // the CTB the thread is working on //int ctby[MAX_THREADS]; }; #endif libde265-1.0.18/libde265/transform.cc000066400000000000000000000540171515675107500167760ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. 
* * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #include "transform.h" #include "util.h" #include const int tab8_22[] = { 29,30,31,32,33,33,34,34,35,35,36,36,37 /*,37*/ }; // (8.6.1) void decode_quantization_parameters(thread_context* tctx, int xC,int yC, int xCUBase, int yCUBase) { logtrace(LogTransform,">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> decode_quantization_parameters(int xC,int yC)=(%d,%d)\n", xC,yC); const pic_parameter_set& pps = tctx->img->get_pps(); const seq_parameter_set& sps = tctx->img->get_sps(); slice_segment_header* shdr = tctx->shdr; // top left pixel position of current quantization group int xQG = xCUBase - (xCUBase & ((1<currentQG_x && yQG == tctx->currentQG_y) { return; } */ // if first QG in CU, remember last QPY of last CU previous QG if (xQG != tctx->currentQG_x || yQG != tctx->currentQG_y) { tctx->lastQPYinPreviousQG = tctx->currentQPY; tctx->currentQG_x = xQG; tctx->currentQG_y = yQG; } int qPY_PRED; // first QG in CTB row ? int ctbLSBMask = ((1<shdr->SliceAddrRS; int SliceStartX = (first_ctb_in_slice_RS % sps.PicWidthInCtbsY) * sps.CtbSizeY; int SliceStartY = (first_ctb_in_slice_RS / sps.PicWidthInCtbsY) * sps.CtbSizeY; bool firstQGInSlice = (SliceStartX == xQG && SliceStartY == yQG); // first QG in tile ? 
bool firstQGInTile = false; if (pps.tiles_enabled_flag) { if ((xQG & ((1 << sps.Log2CtbSizeY)-1)) == 0 && (yQG & ((1 << sps.Log2CtbSizeY)-1)) == 0) { int ctbX = xQG >> sps.Log2CtbSizeY; int ctbY = yQG >> sps.Log2CtbSizeY; firstQGInTile = pps.is_tile_start_CTB(ctbX,ctbY); // TODO: this is slow } } if (firstQGInSlice || firstQGInTile || (firstInCTBRow && pps.entropy_coding_sync_enabled_flag)) { qPY_PRED = tctx->shdr->SliceQPY; } else { qPY_PRED = tctx->lastQPYinPreviousQG; } int qPYA,qPYB; if (tctx->img->available_zscan(xQG,yQG, xQG-1,yQG)) { int xTmp = (xQG-1) >> sps.Log2MinTrafoSize; int yTmp = (yQG ) >> sps.Log2MinTrafoSize; int minTbAddrA = pps.MinTbAddrZS[xTmp + yTmp*sps.PicWidthInTbsY]; uint32_t ctbAddrA = minTbAddrA >> (2 * (sps.Log2CtbSizeY-sps.Log2MinTrafoSize)); if (ctbAddrA == tctx->CtbAddrInTS) { qPYA = tctx->img->get_QPY(xQG-1,yQG); } else { qPYA = qPY_PRED; } } else { qPYA = qPY_PRED; } if (tctx->img->available_zscan(xQG,yQG, xQG,yQG-1)) { int xTmp = (xQG ) >> sps.Log2MinTrafoSize; int yTmp = (yQG-1) >> sps.Log2MinTrafoSize; uint32_t minTbAddrB = pps.MinTbAddrZS[xTmp + yTmp*sps.PicWidthInTbsY]; uint32_t ctbAddrB = minTbAddrB >> (2 * (sps.Log2CtbSizeY-sps.Log2MinTrafoSize)); if (ctbAddrB == tctx->CtbAddrInTS) { qPYB = tctx->img->get_QPY(xQG,yQG-1); } else { qPYB = qPY_PRED; } } else { qPYB = qPY_PRED; } qPY_PRED = (qPYA + qPYB + 1)>>1; logtrace(LogTransform,"qPY_PRED = %d (%d, %d)\n",qPY_PRED, qPYA, qPYB); int QPY = ((qPY_PRED + tctx->CuQpDelta + 52+2*sps.QpBdOffset_Y) % (52 + sps.QpBdOffset_Y)) - sps.QpBdOffset_Y; assert(QPY >= -sps.QpBdOffset_Y && QPY <= 51); tctx->qPYPrime = QPY + sps.QpBdOffset_Y; int qPiCb = Clip3(-sps.QpBdOffset_C,57, QPY+pps.pic_cb_qp_offset + shdr->slice_cb_qp_offset + tctx->CuQpOffsetCb); int qPiCr = Clip3(-sps.QpBdOffset_C,57, QPY+pps.pic_cr_qp_offset + shdr->slice_cr_qp_offset + tctx->CuQpOffsetCr); logtrace(LogTransform,"qPiCb:%d (%d %d), qPiCr:%d (%d %d)\n", qPiCb, pps.pic_cb_qp_offset, shdr->slice_cb_qp_offset, qPiCr, 
pps.pic_cr_qp_offset, shdr->slice_cr_qp_offset); int qPCb,qPCr; if (sps.ChromaArrayType == CHROMA_420) { qPCb = table8_22(qPiCb); qPCr = table8_22(qPiCr); } else { qPCb = qPiCb; qPCr = qPiCr; } //printf("q: %d %d\n",qPiCb, qPCb); tctx->qPCbPrime = qPCb + sps.QpBdOffset_C; if (tctx->qPCbPrime<0) { tctx->qPCbPrime = 0; } tctx->qPCrPrime = qPCr + sps.QpBdOffset_C; if (tctx->qPCrPrime<0) { tctx->qPCrPrime = 0; } /* printf("Q: %d (%d %d %d / %d %d) %d %d %d\n",QPY, sps->QpBdOffset_Y, pps->pic_cb_qp_offset + shdr->slice_cb_qp_offset, pps->pic_cr_qp_offset + shdr->slice_cr_qp_offset, sps->QpBdOffset_C, sps->QpBdOffset_C, tctx->qPYPrime, tctx->qPCbPrime, tctx->qPCrPrime); */ int log2CbSize = tctx->img->get_log2CbSize(xCUBase, yCUBase); // TODO: On broken input, log2CbSize may be zero (multithreaded only). Not sure yet why. // Maybe another decoding thread is overwriting the value set in slice.cc:read_coding_unit. // id:000163,sig:06,src:002041,op:havoc,rep:16.bin if (log2CbSize<3) { log2CbSize=3; } tctx->img->set_QPY(xCUBase, yCUBase, log2CbSize, QPY); tctx->currentQPY = QPY; /* printf("SET QPY POC=%d %d;%d-%d;%d = %d\n",ctx->img->PicOrderCntVal,xCUBase,yCUBase, xCUBase+(1<qPYPrime); } template void transform_coefficients(acceleration_functions* acceleration, int16_t* coeff, int coeffStride, int nT, int trType, pixel_t* dst, int dstStride, int bit_depth) { logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT); if (trType==1) { acceleration->transform_4x4_dst_add(dst, coeff, dstStride, bit_depth); } else { /**/ if (nT==4) { acceleration->transform_add(0,dst,coeff,dstStride, bit_depth); } else if (nT==8) { acceleration->transform_add(1,dst,coeff,dstStride, bit_depth); } else if (nT==16) { acceleration->transform_add(2,dst,coeff,dstStride, bit_depth); } else { acceleration->transform_add(3,dst,coeff,dstStride, bit_depth); } } #if 0 printf("decoded pixels:\n"); for (int y=0;yimg->get_sps().BitDepth_C; const int BitDepthY = tctx->img->get_sps().BitDepth_Y; for 
(int y=0;yBitDepthC, for which we could also eliminate one shift. The remaining case is also one shift only. */ residual[y*nT+x] += (tctx->ResScaleVal * static_cast((static_cast(tctx->residual_luma[y*nT+x]) << BitDepthC ) >> BitDepthY ) ) >> 3; } } template void transform_coefficients_explicit(thread_context* tctx, int16_t* coeff, int coeffStride, int nT, int trType, pixel_t* dst, int dstStride, int bit_depth, int cIdx) { logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT); const acceleration_functions* acceleration = &tctx->decctx->acceleration; int32_t residual_buffer[32*32]; int32_t* residual; if (cIdx==0) { residual = tctx->residual_luma; } else { residual = residual_buffer; } // TODO int bdShift = 20 - bit_depth; int max_coeff_bits = 15; if (trType==1) { acceleration->transform_idst_4x4(residual, coeff, bdShift, max_coeff_bits); } else { /**/ if (nT==4) { acceleration->transform_idct_4x4(residual,coeff,bdShift,max_coeff_bits); } else if (nT==8) { acceleration->transform_idct_8x8(residual,coeff,bdShift,max_coeff_bits); } else if (nT==16) { acceleration->transform_idct_16x16(residual,coeff,bdShift,max_coeff_bits); } else { acceleration->transform_idct_32x32(residual,coeff,bdShift,max_coeff_bits); } } //printBlk("prediction",(uint8_t*)dst,nT,dstStride); //printBlk("residual",residual,nT,nT); if (cIdx != 0) { if (tctx->ResScaleVal != 0) { cross_comp_pred(tctx, residual, nT); } //printBlk("cross-comp-pred modified residual",residual,nT,nT); } acceleration->add_residual(dst,dstStride, residual,nT, bit_depth); } void inv_transform(acceleration_functions* acceleration, uint8_t* dst, int dstStride, int16_t* coeff, int log2TbSize, int trType) { if (trType==1) { assert(log2TbSize==2); acceleration->transform_4x4_dst_add_8(dst, coeff, dstStride); } else { acceleration->transform_add_8[log2TbSize-2](dst,coeff,dstStride); } #if 0 int nT = 1<fwd_transform_4x4_dst_8(coeff, src, srcStride); } else { // DCT 4x4, 8x8, 16x16, 32x32 
acceleration->fwd_transform_8[log2TbSize-2](coeff,src,srcStride); } } static const int levelScale[] = { 40,45,51,57,64,72 }; // (8.6.2) and (8.6.3) template void scale_coefficients_internal(thread_context* tctx, int xT,int yT, // position of TU in frame (chroma adapted) int x0,int y0, // position of CU in frame (chroma adapted) int nT, int cIdx, bool transform_skip_flag, bool intra, int rdpcmMode) { const seq_parameter_set& sps = tctx->img->get_sps(); const pic_parameter_set& pps = tctx->img->get_pps(); int qP; switch (cIdx) { case 0: qP = tctx->qPYPrime; break; case 1: qP = tctx->qPCbPrime; break; case 2: qP = tctx->qPCrPrime; break; default: qP = 0; assert(0); break; // should never happen } logtrace(LogTransform,"qP: %d\n",qP); int16_t* coeff; int coeffStride; coeff = tctx->coeffBuf; coeffStride = nT; pixel_t* pred; int stride; pred = tctx->img->get_image_plane_at_pos_NEW(cIdx, xT,yT); stride = tctx->img->get_image_stride(cIdx); // We explicitly include the case for sizeof(pixel_t)==1 so that the compiler // can optimize away a lot of code for 8-bit pixels. const int bit_depth = ((sizeof(pixel_t)==1) ? 
8 : sps.get_bit_depth(cIdx)); //assert(intra == (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA)); int cuPredModeIntra = (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA); bool rotateCoeffs = (sps.range_extension.transform_skip_rotation_enabled_flag && nT == 4 && cuPredModeIntra); if (tctx->cu_transquant_bypass_flag) { int32_t residual_buffer[32*32]; int32_t* residual; if (cIdx==0) residual = tctx->residual_luma; else residual = residual_buffer; // TODO: we could fold the coefficient rotation into the coefficient expansion here: for (int i=0;inCoeff[cIdx];i++) { int32_t currCoeff = tctx->coeffList[cIdx][i]; tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff; } if (rotateCoeffs) { tctx->decctx->acceleration.rotate_coefficients(coeff, nT); } if (rdpcmMode) { if (rdpcmMode==2) tctx->decctx->acceleration.transform_bypass_rdpcm_v(residual, coeff, nT); else tctx->decctx->acceleration.transform_bypass_rdpcm_h(residual, coeff, nT); } else { tctx->decctx->acceleration.transform_bypass(residual, coeff, nT); } if (cIdx != 0) { if (tctx->ResScaleVal != 0) { cross_comp_pred(tctx, residual, nT); } } tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth); if (rotateCoeffs) { memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around } } else { // (8.6.3) int bdShift = (cIdx==0 ? 
sps.BitDepth_Y : sps.BitDepth_C) + Log2(nT) - 5; logtrace(LogTransform,"bdShift=%d\n",bdShift); logtrace(LogTransform,"dequant %d;%d cIdx=%d qp=%d\n",xT*(cIdx?2:1),yT*(cIdx?2:1),cIdx,qP); // --- inverse quantization --- if (sps.scaling_list_enable_flag==0) { //const int m_x_y = 16; const int m_x_y = 1; bdShift -= 4; // this is equivalent to having a m_x_y of 16 and we can use 32bit integers const int offset = (1<<(bdShift-1)); const int fact = m_x_y * levelScale[qP%6] << (qP/6); for (int i=0;inCoeff[cIdx];i++) { int64_t currCoeff = tctx->coeffList[cIdx][i]; //logtrace(LogTransform,"coefficient[%d] = %d\n",tctx->coeffPos[cIdx][i], //tctx->coeffList[cIdx][i]); currCoeff = Clip3(-32768,32767, ( (currCoeff * fact + offset ) >> bdShift)); //logtrace(LogTransform," -> %d\n",currCoeff); tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff; } } else { const int offset = (1<<(bdShift-1)); const uint8_t* sclist; int matrixID = cIdx; if (nT==32) { matrixID=0; } if (!intra) { if (nT<32) { matrixID += 3; } else { matrixID++; } } switch (nT) { case 4: sclist = &pps.scaling_list.ScalingFactor_Size0[matrixID][0][0]; break; case 8: sclist = &pps.scaling_list.ScalingFactor_Size1[matrixID][0][0]; break; case 16: sclist = &pps.scaling_list.ScalingFactor_Size2[matrixID][0][0]; break; case 32: sclist = &pps.scaling_list.ScalingFactor_Size3[matrixID][0][0]; break; default: assert(0); sclist = nullptr; } for (int i=0;inCoeff[cIdx];i++) { int pos = tctx->coeffPos[cIdx][i]; const int m_x_y = sclist[pos]; const int fact = m_x_y * levelScale[qP%6] << (qP/6); int64_t currCoeff = tctx->coeffList[cIdx][i]; currCoeff = Clip3(-32768,32767, ( (currCoeff * fact + offset ) >> bdShift)); tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff; } } // --- do transform or skip --- logtrace(LogTransform,"coefficients OUT:\n"); for (int y=0;ydecctx->acceleration.rotate_coefficients(coeff, nT); } int32_t residual_buffer[32*32]; int32_t* residual; if (cIdx==0) residual = tctx->residual_luma; else residual = 
residual_buffer; if (rdpcmMode) { /* if (rdpcmMode==2) tctx->decctx->acceleration.transform_skip_rdpcm_v(pred,coeff, Log2(nT), stride, bit_depth); else tctx->decctx->acceleration.transform_skip_rdpcm_h(pred,coeff, Log2(nT), stride, bit_depth); */ if (rdpcmMode==2) tctx->decctx->acceleration.rdpcm_v(residual, coeff,nT, tsShift,bdShift); else tctx->decctx->acceleration.rdpcm_h(residual, coeff,nT, tsShift,bdShift); } else { //tctx->decctx->acceleration.transform_skip(pred, coeff, stride, bit_depth); tctx->decctx->acceleration.transform_skip_residual(residual, coeff, nT, tsShift, bdShift); } if (cIdx != 0) { if (tctx->ResScaleVal != 0) { cross_comp_pred(tctx, residual, nT); } } tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth); if (rotateCoeffs) { memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around } } else { int trType; //if (nT==4 && cIdx==0 && tctx->img->get_pred_mode(xT,yT)==MODE_INTRA) { if (nT==4 && cIdx==0 && cuPredModeIntra) { trType=1; } else { trType=0; } assert(rdpcmMode==0); if (tctx->img->get_pps().range_extension.cross_component_prediction_enabled_flag) { // cross-component-prediction: transform to residual buffer and add in a separate step transform_coefficients_explicit(tctx, coeff, coeffStride, nT, trType, pred, stride, bit_depth, cIdx); } else { transform_coefficients(&tctx->decctx->acceleration, coeff, coeffStride, nT, trType, pred, stride, bit_depth); } } } logtrace(LogTransform,"pixels (cIdx:%d), position %d %d:\n",cIdx, xT,yT); for (int y=0;ynCoeff[cIdx];i++) { tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = 0; } } void scale_coefficients(thread_context* tctx, int xT,int yT, // position of TU in frame (chroma adapted) int x0,int y0, // position of CU in frame (chroma adapted) int nT, int cIdx, bool transform_skip_flag, bool intra, int rdpcmMode // 0 - off, 1 - Horizontal, 2 - Vertical ) { if (tctx->img->high_bit_depth(cIdx)) { scale_coefficients_internal(tctx, xT,yT, x0,y0, nT,cIdx, 
transform_skip_flag, intra, rdpcmMode); } else { scale_coefficients_internal (tctx, xT,yT, x0,y0, nT,cIdx, transform_skip_flag, intra, rdpcmMode); } } //#define QUANT_IQUANT_SHIFT 20 // Q(QP%6) * IQ(QP%6) = 2^20 #define QUANT_SHIFT 14 // Q(4) = 2^14 //#define SCALE_BITS 15 // Inherited from TMuC, presumably for fractional bit estimates in RDOQ #define MAX_TR_DYNAMIC_RANGE 15 // Maximum transform dynamic range (excluding sign bit) const static uint16_t g_quantScales[6] = { 26214,23302,20560,18396,16384,14564 }; void quant_coefficients(//encoder_context* ectx, int16_t* out_coeff, const int16_t* in_coeff, int log2TrSize, int qp, bool intra) { const int qpDiv6 = qp / 6; const int qpMod6 = qp % 6; //int uiLog2TrSize = xLog2( iWidth - 1); int uiQ = g_quantScales[qpMod6]; int bitDepth = 8; int transformShift = MAX_TR_DYNAMIC_RANGE - bitDepth - log2TrSize; // Represents scaling through forward transform int qBits = QUANT_SHIFT + qpDiv6 + transformShift; /* TODO: originally, this was checking for intra slices, why not for intra mode ? */ int rnd = (intra ? 171 : 85) << (qBits-9); int x, y; int nStride = (1< ", x,y,level); sign = (level < 0 ? -1: 1); level = (abs_value(level) * uiQ + rnd ) >> qBits; level *= sign; out_coeff[blockPos] = Clip3(-32768, 32767, level); //logtrace(LogTransform,"%d\n", out_coeff[blockPos]); } } } void dequant_coefficients(int16_t* out_coeff, const int16_t* in_coeff, int log2TrSize, int qP) { const int m_x_y = 1; int bitDepth = 8; int bdShift = bitDepth + log2TrSize - 5; bdShift -= 4; // this is equivalent to having a m_x_y of 16 and we can use 32bit integers const int offset = (1<<(bdShift-1)); const int fact = m_x_y * levelScale[qP%6] << (qP/6); //int blkSize = (1<> bdShift)); //logtrace(LogTransform," -> %d\n",currCoeff); out_coeff[i] = currCoeff; } } libde265-1.0.18/libde265/transform.h000066400000000000000000000043231515675107500166330ustar00rootroot00000000000000/* * H.265 video codec. 
* Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef DE265_TRANSFORM_H #define DE265_TRANSFORM_H #include "libde265/de265.h" #include "libde265/decctx.h" extern const int tab8_22[]; LIBDE265_INLINE static int table8_22(int qPi) { if (qPi<30) return qPi; if (qPi>=43) return qPi-6; return tab8_22[qPi-30]; } // (8.6.1) void decode_quantization_parameters(thread_context* tctx, int xC,int yC, int xCUBase, int yCUBase); // (8.6.2) void scale_coefficients(thread_context* tctx, int xT,int yT, // position of TU in frame (chroma adapted) int x0,int y0, // position of CU in frame (chroma adapted) int nT, int cIdx, bool transform_skip_flag, bool intra, int rdpcmMode); void inv_transform(acceleration_functions* acceleration, uint8_t* dst, int dstStride, int16_t* coeff, int log2TbSize, int trType); void fwd_transform(acceleration_functions* acceleration, int16_t* coeff, int coeffStride, int log2TbSize, int trType, const int16_t* src, int srcStride); void quant_coefficients(int16_t* out_coeff, const int16_t* in_coeff, int log2TrSize, int qp, bool intra); void dequant_coefficients(int16_t* out_coeff, const int16_t* in_coeff, int log2TrSize, int qP); #endif libde265-1.0.18/libde265/util.cc000066400000000000000000000134151515675107500157350ustar00rootroot00000000000000/* * H.265 video codec. 
* Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #include "util.h" #include "de265.h" #include #include #include void copy_subimage(uint8_t* dst,int dststride, const uint8_t* src,int srcstride, int w, int h) { for (int y=0;y=2; } #endif #ifdef DE265_LOG_TRACE void logtrace(enum LogModule module, const char* string, ...) { if (verbosity<3) return; if (current_poc < log_poc_start) { return; } if (disable_log[module]) return; //if (module != LogSymbols /*&& module != LogCABAC*/) { return; } //if (logcnt<319500) return; //if (module != LogCABAC) return; va_list va; if (string[0]=='$') { int id = string[1]-'0'; logcnt[id]++; fprintf(stdout, "[%ld] ",logcnt[id]); string += 3; } int noPrefix = (string[0]=='*'); if (!noPrefix) { } // fprintf(stdout, "ERR: "); va_start(va, string); vfprintf(stdout, string + (noPrefix ? 1 : 0), va); va_end(va); fflush(stdout); } #endif

/* Write a printf-style message to the stream `fh`.
 *
 * fh     - destination stream; receives the prefix and the message, and is
 *          flushed before returning.
 * string - printf-style format string. A leading '*' suppresses the "INFO: "
 *          prefix; the '*' itself is skipped and not printed.
 * ...    - arguments consumed by the format string.
 */
void log2fh(FILE* fh, const char* string, ...)
{
  va_list va;

  int noPrefix = (string[0]=='*');

  // The prefix must go to the same stream as the message, otherwise it ends up
  // detached from the text whenever fh is not stdout.
  if (!noPrefix) fprintf(fh, "INFO: ");

  va_start(va, string);
  vfprintf(fh, string + (noPrefix ? 1 : 0), va);
  va_end(va);

  // Flush the stream that was actually written to.
  fflush(fh);
}

void printBlk(const char* title, const int16_t* data, int blksize, int stride, const std::string& prefix) { if (title) printf("%s%s:\n",prefix.c_str(),title); for (int y=0;y * * This file is part of libde265.
* * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef DE265_UTIL_H #define DE265_UTIL_H #ifdef HAVE_CONFIG_H #include "config.h" #endif #ifndef _MSC_VER #include #endif #include #include #include "libde265/de265.h" #ifdef __GNUC__ #define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) #endif #ifdef _MSC_VER #define LIBDE265_DECLARE_ALIGNED( var, n ) __declspec(align(n)) var #define likely(x) (x) #define unlikely(x) (x) #else #define LIBDE265_DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n))) #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) #endif #if defined(__GNUC__) && (__GNUC__ >= 4) #define LIBDE265_CHECK_RESULT __attribute__ ((warn_unused_result)) #elif defined(_MSC_VER) && (_MSC_VER >= 1700) #define LIBDE265_CHECK_RESULT _Check_return_ #else #define LIBDE265_CHECK_RESULT #endif // Be careful with these alignment instructions. They only specify the alignment within // a struct. But they cannot make sure that the base address of the struct has the same alignment // when it is dynamically allocated. 
#define ALIGNED_32( var ) LIBDE265_DECLARE_ALIGNED( var, 32 ) #define ALIGNED_16( var ) LIBDE265_DECLARE_ALIGNED( var, 16 ) #define ALIGNED_8( var ) LIBDE265_DECLARE_ALIGNED( var, 8 ) #define ALIGNED_4( var ) LIBDE265_DECLARE_ALIGNED( var, 4 ) #ifdef _MSC_VER #ifdef _CPPRTTI #define RTTI_ENABLED #endif #else #ifdef __GXX_RTTI #define RTTI_ENABLED #endif #endif //inline uint8_t Clip1_8bit(int16_t value) { if (value<=0) return 0; else if (value>=255) return 255; else return value; } #define Clip1_8bit(value) ((value)<0 ? 0 : (value)>255 ? 255 : (value)) #define Clip_BitDepth(value, bit_depth) ((value)<0 ? 0 : (value)>((1<(high) ? (high) : (value)) #define Sign(value) (((value)<0) ? -1 : ((value)>0) ? 1 : 0) #define abs_value(a) (((a)<0) ? -(a) : (a)) #define libde265_min(a,b) (((a)<(b)) ? (a) : (b)) #define libde265_max(a,b) (((a)>(b)) ? (a) : (b)) LIBDE265_INLINE static int ceil_div(int num,int denom) { num += denom-1; return num/denom; } LIBDE265_INLINE static int ceil_log2(int val) { int n=0; while (val > (1<1) { n++; v>>=1; } return n; } LIBDE265_INLINE static int Log2SizeToArea(int v) { return (1<<(v<<1)); } void copy_subimage(uint8_t* dst,int dststride, const uint8_t* src,int srcstride, int w, int h); // === logging === enum LogModule { LogHighlevel, LogHeaders, LogSlice, LogDPB, LogMotion, LogTransform, LogDeblock, LogSAO, LogSEI, LogIntraPred, LogPixels, LogSymbols, LogCABAC, LogEncoder, LogEncoderMetadata, NUMBER_OF_LogModules }; #if defined(DE265_LOG_ERROR) || defined(DE265_LOG_INFO) || defined(DE265_LOG_DEBUG) || defined(DE265_LOG_TRACE) # define DE265_LOGGING 1 void enable_logging(enum LogModule); void disable_logging(enum LogModule); #else #define enable_logging(x) { } #define disable_logging(x) { } #endif #ifdef DE265_LOGGING void log_set_current_POC(int poc); #else #define log_set_current_POC(poc) { } #endif #ifdef DE265_LOG_ERROR void logerror(enum LogModule module, const char* string, ...); #else #define logerror(...) 
{ } #endif #ifdef DE265_LOG_INFO void loginfo (enum LogModule module, const char* string, ...); #else #define loginfo(...) { } #endif #ifdef DE265_LOG_DEBUG void logdebug(enum LogModule module, const char* string, ...); bool logdebug_enabled(enum LogModule module); #else #define logdebug(...) { } inline bool logdebug_enabled(enum LogModule module) { return false; } #endif #ifdef DE265_LOG_TRACE void logtrace(enum LogModule module, const char* string, ...); #else #define logtrace(...) { } #endif void log2fh(FILE* fh, const char* string, ...); void printBlk(const char* title,const int32_t* data, int blksize, int stride, const std::string& prefix=" "); void printBlk(const char* title,const int16_t* data, int blksize, int stride, const std::string& prefix=" "); void printBlk(const char* title,const uint8_t* data, int blksize, int stride, const std::string& prefix=" "); void debug_set_image_output(void (*)(const struct de265_image*, int slot)); void debug_show_image(const struct de265_image*, int slot); #endif libde265-1.0.18/libde265/visualize.cc000066400000000000000000000435731515675107500170030ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #include "visualize.h" #include "decctx.h" #include #if 0 void writeFrame_Y(de265_image* img,const char* filename) { int w = ctx->img->get_width(); int h = ctx->img->get_height(); //int c_idx=0; int ctb_size = 64; // HACK int stride = ctx->img->get_luma_stride(); for (int ctbY=0;ctbYcurrent_sps->PicHeightInCtbsY;ctbY++) for (int ctbX=0;ctbXcurrent_sps->PicWidthInCtbsY;ctbX++) { int x0 = ctbX*ctb_size; int y0 = ctbY*ctb_size; uint8_t *src = ctx->img->get_image_plane_at_pos(0,x0,y0); printf("%s %d %d\n",filename,x0,y0); int dx,dy; for (dy=0;dyget_image_plane_at_pos(c, 0,y), de265_get_image_width(img,c), 1, fh); fflush(fh); fclose(fh); } void set_pixel(uint8_t* img, int x,int y, int stride, uint32_t color, int pixelSize) { for (int i=0;i>(i*8)) & 0xFF; img[y*stride + x*pixelSize + i] = col; } } void draw_block_boundary(const de265_image* srcimg, uint8_t* img,int stride, int x,int y,int hBlkSize, int vBlkSize, uint32_t color, int pixelSize) { for (int i=0;iget_sps().pic_height_in_luma_samples) { set_pixel(img,x,yi,stride,color,pixelSize); } } for (int i=0;iget_sps().pic_width_in_luma_samples) { set_pixel(img,xi,y,stride,color,pixelSize); } } } #include "intrapred.h" void draw_intra_pred_mode(const de265_image* srcimg, uint8_t* img,int stride, int x0,int y0,int log2BlkSize, enum IntraPredMode mode, uint32_t color,int pixelSize) { int w = 1< draw square for (int i=-w*1/4;i<=w*1/4;i++) { set_pixel(img, x0+w*1/4, y0+w/2+i,stride, color, pixelSize); set_pixel(img, x0+w*3/4, y0+w/2+i,stride, color, pixelSize); set_pixel(img, x0+w/2+i, y0+w*1/4,stride, color, pixelSize); set_pixel(img, x0+w/2+i, y0+w*3/4,stride, color, pixelSize); } } else if (mode==1) { // DC -> draw circle for (int i=-w/4;i draw line in prediction direction int slope = intraPredAngle_table[mode]; bool horiz = (mode<18); if (horiz) { for (int i=-w/2;i=0 && yget_sps().pic_height_in_luma_samples) { set_pixel(img, x0+i+w/2, y, stride, color, pixelSize); } } } else { for (int i=-w/2;i=0 && 
xget_sps().pic_width_in_luma_samples) { set_pixel(img, x, y0+i+w/2, stride, color, pixelSize); } } } } } void drawTBgrid(const de265_image* srcimg, uint8_t* img, int stride, int x0,int y0, uint32_t color, int pixelSize, int log2CbSize, int trafoDepth) { int split_transform_flag = srcimg->get_split_transform_flag(x0,y0,trafoDepth); if (split_transform_flag) { int x1 = x0 + ((1<<(log2CbSize-trafoDepth))>>1); int y1 = y0 + ((1<<(log2CbSize-trafoDepth))>>1); drawTBgrid(srcimg,img,stride,x0,y0,color,pixelSize,log2CbSize,trafoDepth+1); drawTBgrid(srcimg,img,stride,x1,y0,color,pixelSize,log2CbSize,trafoDepth+1); drawTBgrid(srcimg,img,stride,x0,y1,color,pixelSize,log2CbSize,trafoDepth+1); drawTBgrid(srcimg,img,stride,x1,y1,color,pixelSize,log2CbSize,trafoDepth+1); } else { draw_block_boundary(srcimg,img,stride,x0,y0,1<<(log2CbSize-trafoDepth),1<<(log2CbSize-trafoDepth), color, pixelSize); } } enum DrawMode { Partitioning_CB, Partitioning_TB, Partitioning_PB, IntraPredMode, PBPredMode, PBMotionVectors, QuantP_Y }; void tint_rect(uint8_t* img, int stride, int x0,int y0,int w,int h, uint32_t color, int pixelSize) { for (int y=0;y>(i*8)) & 0xFF; img[yp*stride+xp*pixelSize + i] = (img[yp*stride+xp*pixelSize + i] + col)/2; } } } void fill_rect(uint8_t* img, int stride, int x0,int y0,int w,int h, uint32_t color, int pixelSize) { for (int y=0;y>(i*8)) & 0xFF; img[yp*stride+xp*pixelSize + i] = col; } } } void draw_QuantPY_block(const de265_image* srcimg,uint8_t* img,int stride, int x0,int y0, int w,int h, int pixelSize) { int q = srcimg->get_QPY(x0,y0); const int MIN_DRAW_Q = 20; const int MAX_DRAW_Q = 40; if (qMAX_DRAW_Q) q=MAX_DRAW_Q; float f = ((float)q-MIN_DRAW_Q)/(MAX_DRAW_Q-MIN_DRAW_Q); uint32_t col = 0xFF * f; col = col | (col<<8) | (col<<16); fill_rect(img,stride, x0,y0,w,h, col, pixelSize); } void draw_line(uint8_t* img,int stride,uint32_t color,int pixelSize, int width,int height, int x0,int y0,int x1,int y1) { if (x1==x0 && y1==y0) { 
set_pixel(img,x0,y0,stride,color,pixelSize); } else if (abs(x1-x0) < abs(y1-y0)) { for (int y=y0;y<=y1;y += Sign(y1-y0)) { int x = (y-y0)*(x1-x0)/(y1-y0) + x0; if (x>=0 && x=0 && y=0 && x=0 && yget_pred_mode(x0,y0); uint32_t cols[3] = { 0xff0000, 0x0000ff, 0x00ff00 }; tint_rect(img,stride, x0,y0,w,h, cols[predMode], pixelSize); } else if (what == PBMotionVectors) { const PBMotion& mvi = srcimg->get_mv_info(x0,y0); int x = x0+w/2; int y = y0+h/2; if (mvi.predFlag[0]) { draw_line(img,stride,0xFF0000,pixelSize, srcimg->get_width(), srcimg->get_height(), x,y,x+mvi.mv[0].x,y+mvi.mv[0].y); } if (mvi.predFlag[1]) { draw_line(img,stride,0x00FF00,pixelSize, srcimg->get_width(), srcimg->get_height(), x,y,x+mvi.mv[1].x,y+mvi.mv[1].y); } } } void draw_tree_grid(const de265_image* srcimg, uint8_t* img, int stride, uint32_t color, int pixelSize, enum DrawMode what) { const seq_parameter_set& sps = srcimg->get_sps(); int minCbSize = sps.MinCbSizeY; for (int y0=0;y0get_log2CbSize_cbUnits(x0,y0); if (log2CbSize==0) { continue; } int xb = x0*minCbSize; int yb = y0*minCbSize; int CbSize = 1<get_PartMode(xb,yb); int HalfCbSize = (1<<(log2CbSize-1)); switch (partMode) { case PART_2Nx2N: draw_PB_block(srcimg,img,stride,xb,yb,CbSize,CbSize, what,color,pixelSize); break; case PART_NxN: draw_PB_block(srcimg,img,stride,xb, yb, CbSize/2,CbSize/2, what,color,pixelSize); draw_PB_block(srcimg,img,stride,xb+HalfCbSize,yb, CbSize/2,CbSize/2, what,color,pixelSize); draw_PB_block(srcimg,img,stride,xb ,yb+HalfCbSize,CbSize/2,CbSize/2, what,color,pixelSize); draw_PB_block(srcimg,img,stride,xb+HalfCbSize,yb+HalfCbSize,CbSize/2,CbSize/2, what,color,pixelSize); break; case PART_2NxN: draw_PB_block(srcimg,img,stride,xb, yb, CbSize ,CbSize/2, what,color,pixelSize); draw_PB_block(srcimg,img,stride,xb, yb+HalfCbSize,CbSize ,CbSize/2, what,color,pixelSize); break; case PART_Nx2N: draw_PB_block(srcimg,img,stride,xb, yb, CbSize/2,CbSize, what,color,pixelSize); draw_PB_block(srcimg,img,stride,xb+HalfCbSize,yb, 
CbSize/2,CbSize, what,color,pixelSize); break; case PART_2NxnU: draw_PB_block(srcimg,img,stride,xb, yb, CbSize ,CbSize/4, what,color,pixelSize); draw_PB_block(srcimg,img,stride,xb, yb+CbSize/4 ,CbSize ,CbSize*3/4, what,color,pixelSize); break; case PART_2NxnD: draw_PB_block(srcimg,img,stride,xb, yb, CbSize ,CbSize*3/4, what,color,pixelSize); draw_PB_block(srcimg,img,stride,xb, yb+CbSize*3/4,CbSize ,CbSize/4, what,color,pixelSize); break; case PART_nLx2N: draw_PB_block(srcimg,img,stride,xb, yb, CbSize/4 ,CbSize, what,color,pixelSize); draw_PB_block(srcimg,img,stride,xb+CbSize/4 ,yb, CbSize*3/4,CbSize, what,color,pixelSize); break; case PART_nRx2N: draw_PB_block(srcimg,img,stride,xb, yb, CbSize*3/4,CbSize, what,color,pixelSize); draw_PB_block(srcimg,img,stride,xb+CbSize*3/4,yb, CbSize/4 ,CbSize, what,color,pixelSize); break; default: assert(false); break; } } else if (what==IntraPredMode) { enum PredMode predMode = srcimg->get_pred_mode(xb,yb); if (predMode == MODE_INTRA) { enum PartMode partMode = srcimg->get_PartMode(xb,yb); int HalfCbSize = (1<<(log2CbSize-1)); switch (partMode) { case PART_2Nx2N: draw_intra_pred_mode(srcimg,img,stride,xb,yb,log2CbSize, srcimg->get_IntraPredMode(xb,yb), color,pixelSize); break; case PART_NxN: draw_intra_pred_mode(srcimg,img,stride,xb, yb, log2CbSize-1, srcimg->get_IntraPredMode(xb,yb), color,pixelSize); draw_intra_pred_mode(srcimg,img,stride,xb+HalfCbSize,yb, log2CbSize-1, srcimg->get_IntraPredMode(xb+HalfCbSize,yb), color,pixelSize); draw_intra_pred_mode(srcimg,img,stride,xb ,yb+HalfCbSize,log2CbSize-1, srcimg->get_IntraPredMode(xb,yb+HalfCbSize), color,pixelSize); draw_intra_pred_mode(srcimg,img,stride,xb+HalfCbSize,yb+HalfCbSize,log2CbSize-1, srcimg->get_IntraPredMode(xb+HalfCbSize,yb+HalfCbSize), color,pixelSize); break; default: assert(false); break; } } } } } LIBDE265_API void draw_CB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t color,int pixelSize) { draw_tree_grid(img,dst,stride,color,pixelSize, 
Partitioning_CB); } LIBDE265_API void draw_TB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t color,int pixelSize) { draw_tree_grid(img,dst,stride,color,pixelSize, Partitioning_TB); } LIBDE265_API void draw_PB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t color,int pixelSize) { draw_tree_grid(img,dst,stride,color,pixelSize, Partitioning_PB); } LIBDE265_API void draw_intra_pred_modes(const de265_image* img, uint8_t* dst, int stride, uint32_t color,int pixelSize) { draw_tree_grid(img,dst,stride,color,pixelSize, IntraPredMode); } LIBDE265_API void draw_PB_pred_modes(const de265_image* img, uint8_t* dst, int stride, int pixelSize) { draw_tree_grid(img,dst,stride,0,pixelSize, PBPredMode); } LIBDE265_API void draw_QuantPY(const de265_image* img, uint8_t* dst, int stride, int pixelSize) { draw_tree_grid(img,dst,stride,0,pixelSize, QuantP_Y); } LIBDE265_API void draw_Motion(const de265_image* img, uint8_t* dst, int stride, int pixelSize) { draw_tree_grid(img,dst,stride,0,pixelSize, PBMotionVectors); } LIBDE265_API void draw_Slices(const de265_image* img, uint8_t* dst, int stride, int pixelSize) { const seq_parameter_set& sps = img->get_sps(); // --- mark first CTB in slice (red - independent / green - dependent) --- for (int ctby=0;ctby0 || ctby>0) { prevCtbRS = img->get_pps().CtbAddrTStoRS[ img->get_pps().CtbAddrRStoTS[ctbAddrRS] -1 ]; } if (prevCtbRS<0 || img->get_SliceHeaderIndex_atIndex(ctbAddrRS) != img->get_SliceHeaderIndex_atIndex(prevCtbRS)) { int step=2; int fillcolor = 0xFF0000; if (img->get_SliceHeaderCtb(ctbx,ctby)->dependent_slice_segment_flag) { step=2; fillcolor = 0x00FF00; } for (int x=0;x<1<0 && (img->get_SliceHeaderIndexCtb(ctbx ,ctby) != img->get_SliceHeaderIndexCtb(ctbx-1,ctby))) { int x = ctbx << sps.Log2CtbSizeY; int y0 = ctby << sps.Log2CtbSizeY; for (int y=y0; (y0 && (img->get_SliceHeaderIndexCtb(ctbx,ctby ) != img->get_SliceHeaderIndexCtb(ctbx,ctby-1))) { int x0 = ctbx << sps.Log2CtbSizeY; int y = ctby << 
sps.Log2CtbSizeY; for (int x=x0 ; (xget_sps(); const pic_parameter_set& pps = img->get_pps(); for (int tx=1;tx * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef DE265_VISUALIZE_H #define DE265_VISUALIZE_H #include "libde265/de265.h" #include "libde265/image.h" void write_picture_to_file(const de265_image* img, const char* filename); #ifdef __cplusplus extern "C" { #endif // TODO: these should either move to "sherlock265", or be part of the // "official" public API LIBDE265_API void draw_CB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t value, int pixelSize); LIBDE265_API void draw_TB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t value, int pixelSize); LIBDE265_API void draw_PB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t value, int pixelSize); LIBDE265_API void draw_PB_pred_modes(const de265_image* img, uint8_t* dst, int stride, int pixelSize); LIBDE265_API void draw_intra_pred_modes(const de265_image* img, uint8_t* dst, int stride, uint32_t value, int pixelSize); LIBDE265_API void draw_QuantPY(const de265_image* img, uint8_t* dst, int stride, int pixelSize); LIBDE265_API void draw_Motion(const de265_image* img, uint8_t* dst, int stride, int pixelSize); LIBDE265_API void draw_Slices(const de265_image* img, uint8_t* dst, int stride, int pixelSize); LIBDE265_API void draw_Tiles(const de265_image* img, uint8_t* dst, 
int stride, int pixelSize); #ifdef __cplusplus } #endif #endif libde265-1.0.18/libde265/vps.cc000066400000000000000000000424531515675107500155740ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #include "vps.h" #include "util.h" #include "decctx.h" #include void profile_data::set_defaults(enum profile_idc profile, int level_major, int level_minor) { profile_present_flag = true; profile_space = 0; tier_flag = false; profile_idc = profile; for (int i=0;i<32;i++) { profile_compatibility_flag[i]=false; } switch (profile) { case Profile_Main: profile_compatibility_flag[Profile_Main]=true; profile_compatibility_flag[Profile_Main10]=true; break; case Profile_Main10: profile_compatibility_flag[Profile_Main10]=true; break; default: assert(0); } progressive_source_flag = false; interlaced_source_flag = false; non_packed_constraint_flag = false; frame_only_constraint_flag = false; // --- level --- level_present_flag = true; level_idc = level_major*30 + level_minor*3; } void video_parameter_set::set_defaults(enum profile_idc profile, int level_major, int level_minor) { video_parameter_set_id = 0; vps_max_layers = 1; // always =1 in current version of standard vps_max_sub_layers = 1; // temporal sub-layers vps_temporal_id_nesting_flag = 1; 
profile_tier_level_.general.set_defaults(profile,level_major,level_minor); vps_sub_layer_ordering_info_present_flag = 0; layer[0].vps_max_dec_pic_buffering = 1; layer[0].vps_max_num_reorder_pics = 0; layer[0].vps_max_latency_increase = 0; vps_max_layer_id = 0; vps_num_layer_sets = 1; layer_id_included_flag.resize(vps_num_layer_sets); // --- timing info --- vps_timing_info_present_flag = 0; vps_num_units_in_tick = 0; vps_time_scale = 0; vps_poc_proportional_to_timing_flag = 0; vps_num_ticks_poc_diff_one = 0; vps_num_hrd_parameters = 0; // --- vps extension --- vps_extension_flag = 0; } de265_error video_parameter_set::read(error_queue* errqueue, bitreader* reader) { uint32_t vlc; video_parameter_set_id = vlc = reader->get_bits(4); if (vlc >= DE265_MAX_VPS_SETS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; reader->skip_bits(2); vps_max_layers = vlc = reader->get_bits(6) +1; if (vlc > 63) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; // vps_max_layers_minus1 (range 0...63) vps_max_sub_layers = vlc = reader->get_bits(3) +1; if (vlc >= MAX_TEMPORAL_SUBLAYERS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; vps_temporal_id_nesting_flag = reader->get_bits(1); reader->skip_bits(16); profile_tier_level_.read(reader, vps_max_sub_layers); /* read_bit_rate_pic_rate_info(reader, &bit_rate_pic_rate_info, 0, vps_max_sub_layers-1); */ vps_sub_layer_ordering_info_present_flag = reader->get_bits(1); //assert(vps_max_sub_layers-1 < MAX_TEMPORAL_SUBLAYERS); int firstLayerRead = vps_sub_layer_ordering_info_present_flag ? 
0 : (vps_max_sub_layers-1); for (int i=firstLayerRead;iget_uvlc(); uint32_t v2 = reader->get_uvlc(); uint32_t v3 = reader->get_uvlc(); if (v1 == UVLC_ERROR || v2 == UVLC_ERROR || v3 == UVLC_ERROR) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } if (v1 > 16 || v2 > v1) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } layer[i].vps_max_dec_pic_buffering = v1; layer[i].vps_max_num_reorder_pics = v2; layer[i].vps_max_latency_increase = v3; } if (!vps_sub_layer_ordering_info_present_flag) { assert(firstLayerRead < MAX_TEMPORAL_SUBLAYERS); for (int i=0;iget_bits(6); if ((vlc = reader->get_uvlc()) == UVLC_ERROR || vlc+1>=1024) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } vps_num_layer_sets = vlc + 1; layer_id_included_flag.resize(vps_num_layer_sets); for (int i=1; i <= vps_num_layer_sets-1; i++) { layer_id_included_flag[i].resize(vps_max_layer_id+1); for (int j=0; j <= vps_max_layer_id; j++) { layer_id_included_flag[i][j] = reader->get_bits(1); } } vps_timing_info_present_flag = reader->get_bits(1); if (vps_timing_info_present_flag) { vps_num_units_in_tick = reader->get_bits(32); vps_time_scale = reader->get_bits(32); vps_poc_proportional_to_timing_flag = reader->get_bits(1); if (vps_poc_proportional_to_timing_flag) { vlc = reader->get_uvlc(); if (vlc == UVLC_ERROR) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } vps_num_ticks_poc_diff_one = vlc + 1; vlc = reader->get_uvlc(); if (vlc == UVLC_ERROR || vlc > vps_num_layer_sets) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } vps_num_hrd_parameters = vlc; hrd_layer_set_idx .resize(vps_num_hrd_parameters); cprms_present_flag.resize(vps_num_hrd_parameters); for (int i=0; iget_uvlc(); if (vlc == UVLC_ERROR || vlc > 1023) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, 
false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } hrd_layer_set_idx[i] = vlc; if (i > 0) { cprms_present_flag[i] = reader->get_bits(1); } //hrd_parameters(cprms_present_flag[i], vps_max_sub_layers_minus1) return DE265_OK; // TODO: decode hrd_parameters() } } } vps_extension_flag = reader->get_bits(1); if (vps_extension_flag) { /* while( more_rbsp_data() ) vps_extension_data_flag u(1) rbsp_trailing_bits() */ } return DE265_OK; } de265_error video_parameter_set::write(error_queue* errqueue, CABAC_encoder& out) const { if (video_parameter_set_id >= DE265_MAX_VPS_SETS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; out.write_bits(video_parameter_set_id,4); out.write_bits(0x3,2); out.write_bits(vps_max_layers-1,6); if (vps_max_sub_layers >= MAX_TEMPORAL_SUBLAYERS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; out.write_bits(vps_max_sub_layers-1,3); out.write_bit(vps_temporal_id_nesting_flag); out.write_bits(0xFFFF, 16); profile_tier_level_.write(out, vps_max_sub_layers); /* read_bit_rate_pic_rate_info(reader, &bit_rate_pic_rate_info, 0, vps_max_sub_layers-1); */ out.write_bit(vps_sub_layer_ordering_info_present_flag); //assert(vps_max_sub_layers-1 < MAX_TEMPORAL_SUBLAYERS); int firstLayerRead = vps_sub_layer_ordering_info_present_flag ? 
0 : (vps_max_sub_layers-1); for (int i=firstLayerRead;i=1024) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } out.write_bits(vps_max_layer_id,6); out.write_uvlc(vps_num_layer_sets-1); for (int i=1; i <= vps_num_layer_sets-1; i++) for (int j=0; j <= vps_max_layer_id; j++) { out.write_bit(layer_id_included_flag[i][j]); } out.write_bit(vps_timing_info_present_flag); if (vps_timing_info_present_flag) { out.write_bits(vps_num_units_in_tick,32); out.write_bits(vps_time_scale ,32); out.write_bit (vps_poc_proportional_to_timing_flag); if (vps_poc_proportional_to_timing_flag) { out.write_uvlc(vps_num_ticks_poc_diff_one-1); out.write_uvlc(vps_num_hrd_parameters); for (int i=0; i 0) { out.write_bit(cprms_present_flag[i]); } //hrd_parameters(cprms_present_flag[i], vps_max_sub_layers_minus1) return DE265_OK; // TODO: decode hrd_parameters() } } } out.write_bit(vps_extension_flag); if (vps_extension_flag) { /* while( more_rbsp_data() ) vps_extension_data_flag u(1) rbsp_trailing_bits() */ } return DE265_OK; } void profile_data::read(bitreader* reader) { if (profile_present_flag) { profile_space = reader->get_bits(2); tier_flag = reader->get_bits(1); profile_idc = (enum profile_idc)reader->get_bits(5); for (int i=0; i<32; i++) { profile_compatibility_flag[i] = reader->get_bits(1); } progressive_source_flag = reader->get_bits(1); interlaced_source_flag = reader->get_bits(1); non_packed_constraint_flag = reader->get_bits(1); frame_only_constraint_flag = reader->get_bits(1); reader->skip_bits(44); } if (level_present_flag) { level_idc = reader->get_bits(8); } } void profile_tier_level::read(bitreader* reader, int max_sub_layers) { // --- read the general profile --- general.profile_present_flag = true; general.level_present_flag = true; general.read(reader); // --- read the profile/levels of the sub-layers --- for (int i=0; iget_bits(1); sub_layer[i].level_present_flag = reader->get_bits(1); } if 
(max_sub_layers > 1) { for (int i=max_sub_layers-1; i<8; i++) { reader->skip_bits(2); } } for (int i=0; i 1) { for (int i=max_sub_layers-1; i<8; i++) { out.skip_bits(2); } } for (int i=0; ibit_rate_info_present_flag[i] = reader->get_bits(1); hdr->pic_rate_info_present_flag[i] = reader->get_bits(1); if (hdr->bit_rate_info_present_flag[i]) { hdr->avg_bit_rate[i] = reader->get_bits(16); hdr->max_bit_rate[i] = reader->get_bits(16); } if (hdr->pic_rate_info_present_flag[i]) { hdr->constant_pic_rate_idc[i] = reader->get_bits(2); hdr->avg_pic_rate[i] = reader->get_bits(16); } } } */ #define LOG0(t) log2fh(fh, t) #define LOG1(t,d) log2fh(fh, t,d) #define LOG2(t,d1,d2) log2fh(fh, t,d1,d2) #define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3) void video_parameter_set::dump(int fd) const { FILE* fh; if (fd==1) fh=stdout; else if (fd==2) fh=stderr; else { return; } LOG0("----------------- VPS -----------------\n"); LOG1("video_parameter_set_id : %d\n", video_parameter_set_id); LOG1("vps_max_layers : %d\n", vps_max_layers); LOG1("vps_max_sub_layers : %d\n", vps_max_sub_layers); LOG1("vps_temporal_id_nesting_flag : %d\n", vps_temporal_id_nesting_flag); profile_tier_level_.dump(vps_max_sub_layers, fh); //dump_bit_rate_pic_rate_info(&bit_rate_pic_rate_info, 0, vps_max_sub_layers-1); LOG1("vps_sub_layer_ordering_info_present_flag : %d\n", vps_sub_layer_ordering_info_present_flag); if (vps_sub_layer_ordering_info_present_flag) { for (int i=0;i 0) { LOG2("cprms_present_flag[%d] = %d\n", i, cprms_present_flag[i]); } //hrd_parameters(cprms_present_flag[i], vps_max_sub_layers_minus1) return; // TODO: decode hrd_parameters() } } } LOG1("vps_extension_flag = %d\n", vps_extension_flag); } static const char* profile_name(profile_idc p) { switch (p) { case Profile_Main: return "Main"; case Profile_Main10: return "Main10"; case Profile_MainStillPicture: return "MainStillPicture"; case Profile_FormatRangeExtensions: return "FormatRangeExtensions"; default: return "(unknown)"; } } void 
profile_data::dump(bool general, FILE* fh) const { const char* prefix = (general ? "general" : "sub_layer"); if (profile_present_flag) { LOG2(" %s_profile_space : %d\n", prefix,profile_space); LOG2(" %s_tier_flag : %d\n", prefix,tier_flag); LOG2(" %s_profile_idc : %s\n", prefix, profile_name(profile_idc)); LOG1(" %s_profile_compatibility_flags: ", prefix); for (int i=0; i<32; i++) { if (i) LOG0("*,"); LOG1("*%d",profile_compatibility_flag[i]); } LOG0("*\n"); LOG2(" %s_progressive_source_flag : %d\n",prefix,progressive_source_flag); LOG2(" %s_interlaced_source_flag : %d\n",prefix,interlaced_source_flag); LOG2(" %s_non_packed_constraint_flag : %d\n",prefix,non_packed_constraint_flag); LOG2(" %s_frame_only_constraint_flag : %d\n",prefix,frame_only_constraint_flag); } if (level_present_flag) { LOG3(" %s_level_idc : %d (%4.2f)\n", prefix,level_idc, level_idc/30.0f); } } void profile_tier_level::dump(int max_sub_layers, FILE* fh) const { general.dump(true, fh); for (int i=0; ibit_rate_info_present_flag[i]) { LOG(" avg_bit_rate : %d\n", hdr->avg_bit_rate[i]); LOG(" max_bit_rate : %d\n", hdr->max_bit_rate[i]); } if (hdr->pic_rate_info_present_flag[i]) { LOG(" constant_pic_rate_idc : %d\n", hdr->constant_pic_rate_idc[i]); LOG(" avg_pic_rate[i] : %d\n", hdr->avg_pic_rate[i]); } } } */ libde265-1.0.18/libde265/vps.h000066400000000000000000000110471515675107500154310ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef DE265_VPS_H #define DE265_VPS_H #ifdef HAVE_CONFIG_H #include #endif #include "libde265/bitstream.h" #include "libde265/de265.h" #include "libde265/cabac.h" #include class error_queue; #define MAX_TEMPORAL_SUBLAYERS 8 enum profile_idc { Profile_Main = 1, Profile_Main10 = 2, Profile_MainStillPicture = 3, Profile_FormatRangeExtensions = 4 }; class profile_data { public: void read(bitreader* reader); void write(CABAC_encoder& writer) const; void dump(bool general, FILE* fh) const; void set_defaults(enum profile_idc, int level_major, int level_minor); // --- profile --- bool profile_present_flag; // always true for general profile char profile_space; // currently always 0 bool tier_flag; // main tier or low tier (see Table A-66/A-67) enum profile_idc profile_idc; // profile bool profile_compatibility_flag[32]; // to which profile we are compatible bool progressive_source_flag; bool interlaced_source_flag; bool non_packed_constraint_flag; bool frame_only_constraint_flag; // --- level --- bool level_present_flag; // always true for general level int level_idc; // level * 30 }; class profile_tier_level { public: void read(bitreader* reader, int max_sub_layers); void write(CABAC_encoder& writer, int max_sub_layers) const; void dump(int max_sub_layers, FILE* fh) const; profile_data general; //bool sub_layer_profile_present[MAX_TEMPORAL_SUBLAYERS]; //bool sub_layer_level_present[MAX_TEMPORAL_SUBLAYERS]; profile_data sub_layer[MAX_TEMPORAL_SUBLAYERS]; }; /* struct bit_rate_pic_rate_info { char bit_rate_info_present_flag[8]; char pic_rate_info_present_flag[8]; int avg_bit_rate[8]; int max_bit_rate[8]; char constant_pic_rate_idc[8]; int avg_pic_rate[8]; }; void read_bit_rate_pic_rate_info(bitreader* reader, struct bit_rate_pic_rate_info* hdr, int TempLevelLow, int TempLevelHigh); void 
dump_bit_rate_pic_rate_info(struct bit_rate_pic_rate_info* hdr, int TempLevelLow, int TempLevelHigh); */ typedef struct { uint8_t vps_max_dec_pic_buffering; // [1 ; max_dpb_size] uint8_t vps_max_num_reorder_pics; // [0 ; vps_max_dec_pic_buffering] uint32_t vps_max_latency_increase; // 0 -> no limit, otherwise value is (x-1) } layer_data; class video_parameter_set { public: de265_error read(error_queue* errqueue, bitreader* reader); de265_error write(error_queue* errqueue, CABAC_encoder& out) const; void dump(int fd) const; void set_defaults(enum profile_idc profile, int level_major, int level_minor); uint8_t video_parameter_set_id; // [0;15] uint8_t vps_max_layers; // [1;?] currently always 1 uint8_t vps_max_sub_layers; // [1;7] number of temporal sub-layers bool vps_temporal_id_nesting_flag; // indicate temporal up-switching always possible profile_tier_level profile_tier_level_; bool vps_sub_layer_ordering_info_present_flag; layer_data layer[MAX_TEMPORAL_SUBLAYERS]; uint8_t vps_max_layer_id; // max value for nuh_layer_id in NALs uint16_t vps_num_layer_sets; // [1;1024], currently always 1 std::vector > layer_id_included_flag; // max size = [1024][64] // --- timing info --- bool vps_timing_info_present_flag; uint32_t vps_num_units_in_tick; uint32_t vps_time_scale; bool vps_poc_proportional_to_timing_flag; uint32_t vps_num_ticks_poc_diff_one; uint16_t vps_num_hrd_parameters; // [0;vps_num_layer_sets] std::vector hrd_layer_set_idx; // max size = 1024 std::vector cprms_present_flag; // max size = 1024 // --- vps extension --- bool vps_extension_flag; }; #endif libde265-1.0.18/libde265/vui.cc000066400000000000000000000340541515675107500155650ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. 
* * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #include "vui.h" #include "decctx.h" #include #include #include #define READ_VLC(variable, vlctype) \ if ((vlc = br->get_ ## vlctype()) == UVLC_ERROR) { \ errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); \ return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; \ } \ (variable) = vlc; #define NUM_SAR_PRESETS 17 static uint16_t sar_presets[NUM_SAR_PRESETS+1][2] = { { 0,0 }, { 1,1 }, { 12,11 }, { 10,11 }, { 16,11 }, { 40,33 }, { 24,11 }, { 20,11 }, { 32,11 }, { 80,33 }, { 18,11 }, { 15,11 }, { 64,33 }, { 160,99 }, { 4,3 }, { 3,2 }, { 2,1 } }; #define EXTENDED_SAR 255 const char* get_video_format_name(enum VideoFormat format) { switch (format) { case VideoFormat_Component: return "component"; case VideoFormat_PAL: return "PAL"; case VideoFormat_NTSC: return "NTSC"; case VideoFormat_SECAM: return "SECAM"; case VideoFormat_MAC: return "MAC"; default: return "unspecified"; } } video_usability_information::video_usability_information() = default; de265_error video_usability_information::hrd_parameters(error_queue* errqueue, bitreader* br, const seq_parameter_set* sps) { uint32_t vlc; nal_hrd_parameters_present_flag = br->get_bits(1); vcl_hrd_parameters_present_flag = br->get_bits(1); if (nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag) { sub_pic_hrd_params_present_flag = br->get_bits(1); if (sub_pic_hrd_params_present_flag) { 
tick_divisor_minus2 = br->get_bits(8); du_cpb_removal_delay_increment_length_minus1 = br->get_bits(5); sub_pic_cpb_params_in_pic_timing_sei_flag = br->get_bits(1); dpb_output_delay_du_length_minus1 = br->get_bits(5); } bit_rate_scale = br->get_bits(4); cpb_size_scale = br->get_bits(4); if (sub_pic_hrd_params_present_flag) { cpb_size_du_scale = br->get_bits(4); } initial_cpb_removal_delay_length_minus1 = br->get_bits(5); au_cpb_removal_delay_length_minus1 = br->get_bits(5); dpb_output_delay_length_minus1 = br->get_bits(5); } int i, nalOrVcl; for (i = 0; i < sps->sps_max_sub_layers; i++) { fixed_pic_rate_general_flag[i] = br->get_bits(1); if (!fixed_pic_rate_general_flag[i]) { fixed_pic_rate_within_cvs_flag[i] = br->get_bits(1); } else { fixed_pic_rate_within_cvs_flag[i] = true; } low_delay_hrd_flag[i] = 0;// Inferred to be 0 when not present cpb_cnt_minus1[i] = 0; // Inferred to be 0 when not present if (fixed_pic_rate_within_cvs_flag[i]) { READ_VLC(elemental_duration_in_tc_minus1[i], uvlc); } else { low_delay_hrd_flag[i] = br->get_bits(1); } if (!low_delay_hrd_flag[i]) { READ_VLC(cpb_cnt_minus1[i], uvlc); if (cpb_cnt_minus1[i] > 31) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } } for (nalOrVcl = 0; nalOrVcl < 2; nalOrVcl++) { if (((nalOrVcl == 0) && nal_hrd_parameters_present_flag) || ((nalOrVcl == 1) && vcl_hrd_parameters_present_flag)) { for (uint32_t j = 0; j <= cpb_cnt_minus1[i]; j++) { READ_VLC(bit_rate_value_minus1[i][j][nalOrVcl], uvlc); READ_VLC(cpb_size_value_minus1[i][j][nalOrVcl], uvlc); if (sub_pic_hrd_params_present_flag) { READ_VLC(cpb_size_du_value_minus1[i][j][nalOrVcl], uvlc); READ_VLC(bit_rate_du_value_minus1[i][j][nalOrVcl], uvlc); } cbr_flag[i][j][nalOrVcl] = br->get_bits(1); } } } } return DE265_OK; } de265_error video_usability_information::read(error_queue* errqueue, bitreader* br, const seq_parameter_set* sps) { uint32_t vlc; // --- sample aspect ratio (SAR) --- aspect_ratio_info_present_flag = br->get_bits(1); if 
(aspect_ratio_info_present_flag) { int aspect_ratio_idc = br->get_bits(8); if (aspect_ratio_idc <= NUM_SAR_PRESETS) { sar_width = sar_presets[aspect_ratio_idc][0]; sar_height = sar_presets[aspect_ratio_idc][1]; } else if (aspect_ratio_idc == EXTENDED_SAR) { sar_width = br->get_bits(16); sar_height = br->get_bits(16); } else { sar_width = 0; sar_height = 0; } } else { sar_width = 0; sar_height = 0; } // --- overscan --- overscan_info_present_flag = br->get_bits(1); if (overscan_info_present_flag) { overscan_appropriate_flag = br->get_bits(1); } // --- video signal type --- { // defaults video_format = VideoFormat_Unspecified; video_full_range_flag = false; colour_primaries = 2; transfer_characteristics = 2; matrix_coeffs = 2; } video_signal_type_present_flag = br->get_bits(1); if (video_signal_type_present_flag) { int video_format_idc = br->get_bits(3); if (video_format_idc > 5) { video_format_idc = VideoFormat_Unspecified; } video_format = (VideoFormat)video_format_idc; video_full_range_flag = br->get_bits(1); colour_description_present_flag = br->get_bits(1); if (colour_description_present_flag) { colour_primaries = br->get_bits(8); if (colour_primaries == 0 || colour_primaries == 3 || colour_primaries >= 11) { colour_primaries = 2; } transfer_characteristics = br->get_bits(8); if (transfer_characteristics == 0 || transfer_characteristics == 3 || transfer_characteristics >= 18) { transfer_characteristics = 2; } matrix_coeffs = br->get_bits(8); if (matrix_coeffs >= 11) { matrix_coeffs = 2; } } } // --- chroma / interlaced --- chroma_loc_info_present_flag = br->get_bits(1); if (chroma_loc_info_present_flag) { if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc > 5) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } chroma_sample_loc_type_top_field = vlc; if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc > 5) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return 
DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } chroma_sample_loc_type_bottom_field = vlc; } else { chroma_sample_loc_type_top_field = 0; chroma_sample_loc_type_bottom_field = 0; } neutral_chroma_indication_flag = br->get_bits(1); field_seq_flag = br->get_bits(1); frame_field_info_present_flag = br->get_bits(1); // --- default display window --- default_display_window_flag = br->get_bits(1); if (default_display_window_flag) { READ_VLC(def_disp_win_left_offset, uvlc); READ_VLC(def_disp_win_right_offset, uvlc); READ_VLC(def_disp_win_top_offset, uvlc); READ_VLC(def_disp_win_bottom_offset, uvlc); } else { def_disp_win_left_offset = 0; def_disp_win_right_offset = 0; def_disp_win_top_offset = 0; def_disp_win_bottom_offset = 0; } // --- timing --- vui_timing_info_present_flag = br->get_bits(1); if (vui_timing_info_present_flag) { vui_num_units_in_tick = br->get_bits(32); vui_time_scale = br->get_bits(32); vui_poc_proportional_to_timing_flag = br->get_bits(1); if (vui_poc_proportional_to_timing_flag) { if ((vlc = br->get_uvlc()) == UVLC_ERROR) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } vui_num_ticks_poc_diff_one = vlc + 1; } // --- hrd parameters --- vui_hrd_parameters_present_flag = br->get_bits(1); if (vui_hrd_parameters_present_flag) { de265_error err; err = hrd_parameters(errqueue, br, sps); if (err) { return err; } } } // --- bitstream restriction --- bitstream_restriction_flag = br->get_bits(1); if (bitstream_restriction_flag) { tiles_fixed_structure_flag = br->get_bits(1); motion_vectors_over_pic_boundaries_flag = br->get_bits(1); restricted_ref_pic_lists_flag = br->get_bits(1); if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc > 4095) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } min_spatial_segmentation_idc = vlc; if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc > 16) { 
errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } max_bytes_per_pic_denom = vlc; if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc > 16) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } max_bits_per_min_cu_denom = vlc; if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc > 15) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } log2_max_mv_length_horizontal = vlc; if ((vlc = br->get_uvlc()) == UVLC_ERROR || vlc > 15) { errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } log2_max_mv_length_vertical = vlc; } else { tiles_fixed_structure_flag = false; motion_vectors_over_pic_boundaries_flag = true; restricted_ref_pic_lists_flag = false; // NOTE: default not specified in standard 2014/10 min_spatial_segmentation_idc = 0; max_bytes_per_pic_denom = 2; max_bits_per_min_cu_denom = 1; log2_max_mv_length_horizontal = 15; log2_max_mv_length_vertical = 15; } //vui_read = true; return DE265_OK; } void video_usability_information::dump(int fd) const { //#if (_MSC_VER >= 1500) //#define LOG0(t) loginfo(LogHeaders, t) //#define LOG1(t,d) loginfo(LogHeaders, t,d) //#define LOG2(t,d1,d2) loginfo(LogHeaders, t,d1,d2) //#define LOG3(t,d1,d2,d3) loginfo(LogHeaders, t,d1,d2,d3) FILE* fh; if (fd==1) fh=stdout; else if (fd==2) fh=stderr; else { return; } #define LOG0(t) log2fh(fh, t) #define LOG1(t,d) log2fh(fh, t,d) #define LOG2(t,d1,d2) log2fh(fh, t,d1,d2) #define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3) LOG0("----------------- VUI -----------------\n"); LOG2("sample aspect ratio : %d:%d\n", sar_width,sar_height); LOG1("overscan_info_present_flag : %d\n", overscan_info_present_flag); LOG1("overscan_appropriate_flag : %d\n", overscan_appropriate_flag); LOG1("video_signal_type_present_flag: %d\n", 
video_signal_type_present_flag); if (video_signal_type_present_flag) { LOG1(" video_format : %s\n", get_video_format_name(video_format)); LOG1(" video_full_range_flag : %d\n", video_full_range_flag); LOG1(" colour_description_present_flag : %d\n", colour_description_present_flag); LOG1(" colour_primaries : %d\n", colour_primaries); LOG1(" transfer_characteristics : %d\n", transfer_characteristics); LOG1(" matrix_coeffs : %d\n", matrix_coeffs); } LOG1("chroma_loc_info_present_flag: %d\n", chroma_loc_info_present_flag); if (chroma_loc_info_present_flag) { LOG1(" chroma_sample_loc_type_top_field : %d\n", chroma_sample_loc_type_top_field); LOG1(" chroma_sample_loc_type_bottom_field: %d\n", chroma_sample_loc_type_bottom_field); } LOG1("neutral_chroma_indication_flag: %d\n", neutral_chroma_indication_flag); LOG1("field_seq_flag : %d\n", field_seq_flag); LOG1("frame_field_info_present_flag : %d\n", frame_field_info_present_flag); LOG1("default_display_window_flag : %d\n", default_display_window_flag); LOG1(" def_disp_win_left_offset : %d\n", def_disp_win_left_offset); LOG1(" def_disp_win_right_offset : %d\n", def_disp_win_right_offset); LOG1(" def_disp_win_top_offset : %d\n", def_disp_win_top_offset); LOG1(" def_disp_win_bottom_offset : %d\n", def_disp_win_bottom_offset); LOG1("vui_timing_info_present_flag : %d\n", vui_timing_info_present_flag); if (vui_timing_info_present_flag) { LOG1(" vui_num_units_in_tick : %d\n", vui_num_units_in_tick); LOG1(" vui_time_scale : %d\n", vui_time_scale); } LOG1("vui_poc_proportional_to_timing_flag : %d\n", vui_poc_proportional_to_timing_flag); LOG1("vui_num_ticks_poc_diff_one : %d\n", vui_num_ticks_poc_diff_one); LOG1("vui_hrd_parameters_present_flag : %d\n", vui_hrd_parameters_present_flag); if (vui_hrd_parameters_present_flag) { //hrd_parameters vui_hrd_parameters; } // --- bitstream restriction --- LOG1("bitstream_restriction_flag : %d\n", bitstream_restriction_flag); if (bitstream_restriction_flag) { LOG1(" tiles_fixed_structure_flag 
: %d\n", tiles_fixed_structure_flag); LOG1(" motion_vectors_over_pic_boundaries_flag : %d\n", motion_vectors_over_pic_boundaries_flag); LOG1(" restricted_ref_pic_lists_flag : %d\n", restricted_ref_pic_lists_flag); LOG1(" min_spatial_segmentation_idc : %d\n", min_spatial_segmentation_idc); LOG1(" max_bytes_per_pic_denom : %d\n", max_bytes_per_pic_denom); LOG1(" max_bits_per_min_cu_denom : %d\n", max_bits_per_min_cu_denom); LOG1(" log2_max_mv_length_horizontal : %d\n", log2_max_mv_length_horizontal); LOG1(" log2_max_mv_length_vertical : %d\n", log2_max_mv_length_vertical); } #undef LOG0 #undef LOG1 #undef LOG2 #undef LOG3 //#endif } libde265-1.0.18/libde265/vui.h000066400000000000000000000105251515675107500154240ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #ifndef DE265_VUI_H #define DE265_VUI_H #include "libde265/de265.h" #include "libde265/bitstream.h" #include class error_queue; class seq_parameter_set; enum VideoFormat { VideoFormat_Component = 0, VideoFormat_PAL = 1, VideoFormat_NTSC = 2, VideoFormat_SECAM = 3, VideoFormat_MAC = 4, VideoFormat_Unspecified = 5 }; const char* get_video_format_name(enum VideoFormat); class video_usability_information { public: video_usability_information(); de265_error hrd_parameters(error_queue*, bitreader*, const seq_parameter_set*); de265_error read(error_queue*, bitreader*, const seq_parameter_set*); void dump(int fd) const; // --- sample aspect ratio (SAR) --- bool aspect_ratio_info_present_flag = false; uint16_t sar_width = 0; // sar_width and sar_height are zero if unspecified uint16_t sar_height = 0; // --- overscan --- bool overscan_info_present_flag = false; bool overscan_appropriate_flag = false; // --- video signal type --- bool video_signal_type_present_flag = false; enum VideoFormat video_format = VideoFormat_Unspecified; bool video_full_range_flag = false; bool colour_description_present_flag = false; uint8_t colour_primaries = 2; uint8_t transfer_characteristics = 2; uint8_t matrix_coeffs = 2; // --- chroma / interlaced --- bool chroma_loc_info_present_flag = false; uint8_t chroma_sample_loc_type_top_field = 0; uint8_t chroma_sample_loc_type_bottom_field = 0; bool neutral_chroma_indication_flag = false; bool field_seq_flag = false; bool frame_field_info_present_flag = false; // --- default display window --- bool default_display_window_flag = false; uint32_t def_disp_win_left_offset = 0; uint32_t def_disp_win_right_offset = 0; uint32_t def_disp_win_top_offset = 0; uint32_t def_disp_win_bottom_offset = 0; // --- timing --- bool vui_timing_info_present_flag = false; uint32_t vui_num_units_in_tick = 0; uint32_t vui_time_scale = 0; bool vui_poc_proportional_to_timing_flag = false; uint32_t vui_num_ticks_poc_diff_one = 1; // --- hrd parameters --- bool 
vui_hrd_parameters_present_flag = false; bool nal_hrd_parameters_present_flag; bool vcl_hrd_parameters_present_flag; bool sub_pic_hrd_params_present_flag; uint32_t tick_divisor_minus2; uint32_t du_cpb_removal_delay_increment_length_minus1; bool sub_pic_cpb_params_in_pic_timing_sei_flag; uint32_t dpb_output_delay_du_length_minus1; uint32_t bit_rate_scale; uint32_t cpb_size_scale; uint32_t cpb_size_du_scale; uint32_t initial_cpb_removal_delay_length_minus1; uint32_t au_cpb_removal_delay_length_minus1; uint32_t dpb_output_delay_length_minus1; bool fixed_pic_rate_general_flag[7]; bool fixed_pic_rate_within_cvs_flag[7]; bool low_delay_hrd_flag[7]; uint32_t cpb_cnt_minus1[7]; uint32_t elemental_duration_in_tc_minus1[7]; uint32_t bit_rate_value_minus1[7][32][2]; uint32_t cpb_size_value_minus1[7][32][2]; uint32_t cpb_size_du_value_minus1[7][32][2]; uint32_t bit_rate_du_value_minus1[7][32][2]; bool cbr_flag[7][32][2]; // --- bitstream restriction --- bool bitstream_restriction_flag = false; bool tiles_fixed_structure_flag = false; bool motion_vectors_over_pic_boundaries_flag = true; bool restricted_ref_pic_lists_flag = false; uint16_t min_spatial_segmentation_idc = 0; uint8_t max_bytes_per_pic_denom = 2; uint8_t max_bits_per_min_cu_denom = 1; uint8_t log2_max_mv_length_horizontal = 15; uint8_t log2_max_mv_length_vertical = 15; }; #endif libde265-1.0.18/libde265/x86/000077500000000000000000000000001515675107500150725ustar00rootroot00000000000000libde265-1.0.18/libde265/x86/CMakeLists.txt000066400000000000000000000011301515675107500176250ustar00rootroot00000000000000set (x86_sources sse.cc sse.h ) set (x86_sse_sources sse-motion.cc sse-motion.h sse-dct.h sse-dct.cc ) add_library(x86 OBJECT ${x86_sources}) add_library(x86_sse OBJECT ${x86_sse_sources}) set(sse_flags "") if(NOT MSVC) if(CMAKE_SIZEOF_VOID_P EQUAL 8) set(sse_flags "${sse_flags} -msse4.1") else(CMAKE_SIZEOF_VOID_P EQUAL 8) set(sse_flags "${sse_flags} -msse2 -mssse3 -msse4.1") endif(CMAKE_SIZEOF_VOID_P EQUAL 8) 
endif() set(X86_OBJECTS $ $ PARENT_SCOPE) SET_TARGET_PROPERTIES(x86_sse PROPERTIES COMPILE_FLAGS "${sse_flags}") libde265-1.0.18/libde265/x86/sse-dct.cc000066400000000000000000012115151515675107500167510ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013 openHEVC contributors * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #include "x86/sse-dct.h" #include "libde265/util.h" #ifdef HAVE_CONFIG_H #include "config.h" #endif #include // SSE2 #include // SSSE3 #if HAVE_SSE4_1 #include // SSE4.1 #endif ALIGNED_16(static const int16_t) transform4x4_luma[8][8] = { { 29, +84, 29, +84, 29, +84, 29, +84 }, { +74, +55, +74, +55, +74, +55, +74, +55 }, { 55, -29, 55, -29, 55, -29, 55, -29 }, { +74, -84, +74, -84, +74, -84, +74, -84 }, { 74, -74, 74, -74, 74, -74, 74, -74 }, { 0, +74, 0, +74, 0, +74, 0, +74 }, { 84, +55, 84, +55, 84, +55, 84, +55 }, { -74, -29, -74, -29, -74, -29, -74, -29 } }; ALIGNED_16(static const int16_t) transform4x4[4][8] = { { 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, -64, 64, -64, 64, -64, 64, -64 }, { 83, 36, 83, 36, 83, 36, 83, 36 }, { 36, -83, 36, -83, 36, -83, 36, -83 } }; ALIGNED_16(static const int16_t) transform8x8[12][8] = { { 89, 75, 89, 75, 89, 75, 89, 75 }, { 50, 18, 50, 18, 50, 18, 50, 18 }, { 75, -18, 75, -18, 75, -18, 75, -18 }, { -89, -50, -89, -50,-89, -50,-89, -50 }, { 50, -89, 
50, -89, 50, -89, 50, -89 }, { 18, 75, 18, 75, 18, 75, 18, 75 }, { 18, -50, 18, -50, 18, -50, 18, -50 }, { 75, -89, 75, -89, 75, -89, 75, -89 }, { 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, -64, 64, -64, 64, -64, 64, -64 }, { 83, 36, 83, 36, 83, 36, 83, 36 }, { 36, -83, 36, -83, 36, -83, 36, -83 } }; ALIGNED_16(static const int16_t) transform16x16_1[4][8][8] = { {/*1-3*/ /*2-6*/ { 90, 87, 90, 87, 90, 87, 90, 87 }, { 87, 57, 87, 57, 87, 57, 87, 57 }, { 80, 9, 80, 9, 80, 9, 80, 9 }, { 70, -43, 70, -43, 70, -43, 70, -43 }, { 57, -80, 57, -80, 57, -80, 57, -80 }, { 43, -90, 43, -90, 43, -90, 43, -90 }, { 25, -70, 25, -70, 25, -70, 25, -70 }, { 9, -25, 9, -25, 9, -25, 9, -25 }, },{ /*5-7*/ /*10-14*/ { 80, 70, 80, 70, 80, 70, 80, 70 }, { 9, -43, 9, -43, 9, -43, 9, -43 }, { -70, -87, -70, -87, -70, -87, -70, -87 }, { -87, 9, -87, 9, -87, 9, -87, 9 }, { -25, 90, -25, 90, -25, 90, -25, 90 }, { 57, 25, 57, 25, 57, 25, 57, 25 }, { 90, -80, 90, -80, 90, -80, 90, -80 }, { 43, -57, 43, -57, 43, -57, 43, -57 }, },{ /*9-11*/ /*18-22*/ { 57, 43, 57, 43, 57, 43, 57, 43 }, { -80, -90, -80, -90, -80, -90, -80, -90 }, { -25, 57, -25, 57, -25, 57, -25, 57 }, { 90, 25, 90, 25, 90, 25, 90, 25 }, { -9, -87, -9, -87, -9, -87, -9, -87 }, { -87, 70, -87, 70, -87, 70, -87, 70 }, { 43, 9, 43, 9, 43, 9, 43, 9 }, { 70, -80, 70, -80, 70, -80, 70, -80 }, },{/*13-15*/ /* 26-30 */ { 25, 9, 25, 9, 25, 9, 25, 9 }, { -70, -25, -70, -25, -70, -25, -70, -25 }, { 90, 43, 90, 43, 90, 43, 90, 43 }, { -80, -57, -80, -57, -80, -57, -80, -57 }, { 43, 70, 43, 70, 43, 70, 43, 70 }, { 9, -80, 9, -80, 9, -80, 9, -80 }, { -57, 87, -57, 87, -57, 87, -57, 87 }, { 87, -90, 87, -90, 87, -90, 87, -90 }, } }; ALIGNED_16(static const int16_t) transform16x16_2[2][4][8] = { { /*2-6*/ /*4-12*/ { 89, 75, 89, 75, 89, 75, 89, 75 }, { 75, -18, 75, -18, 75, -18, 75, -18 }, { 50, -89, 50, -89, 50, -89, 50, -89 }, { 18, -50, 18, -50, 18, -50, 18, -50 }, },{ /*10-14*/ /*20-28*/ { 50, 18, 50, 18, 50, 18, 50, 18 }, { -89, -50, -89, -50, 
-89, -50, -89, -50 }, { 18, 75, 18, 75, 18, 75, 18, 75 }, { 75, -89, 75, -89, 75, -89, 75, -89 }, } }; ALIGNED_16(static const int16_t) transform16x16_3[2][2][8] = { {/*4-12*/ /*8-24*/ { 83, 36, 83, 36, 83, 36, 83, 36 }, { 36, -83, 36, -83, 36, -83, 36, -83 }, },{ /*0-8*/ /*0-16*/ { 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, -64, 64, -64, 64, -64, 64, -64 }, } }; ALIGNED_16(static const int16_t) transform32x32[8][16][8] = { { /* 1-3 */ { 90, 90, 90, 90, 90, 90, 90, 90 }, { 90, 82, 90, 82, 90, 82, 90, 82 }, { 88, 67, 88, 67, 88, 67, 88, 67 }, { 85, 46, 85, 46, 85, 46, 85, 46 }, { 82, 22, 82, 22, 82, 22, 82, 22 }, { 78, -4, 78, -4, 78, -4, 78, -4 }, { 73, -31, 73, -31, 73, -31, 73, -31 }, { 67, -54, 67, -54, 67, -54, 67, -54 }, { 61, -73, 61, -73, 61, -73, 61, -73 }, { 54, -85, 54, -85, 54, -85, 54, -85 }, { 46, -90, 46, -90, 46, -90, 46, -90 }, { 38, -88, 38, -88, 38, -88, 38, -88 }, { 31, -78, 31, -78, 31, -78, 31, -78 }, { 22, -61, 22, -61, 22, -61, 22, -61 }, { 13, -38, 13, -38, 13, -38, 13, -38 }, { 4, -13, 4, -13, 4, -13, 4, -13 }, },{/* 5-7 */ { 88, 85, 88, 85, 88, 85, 88, 85 }, { 67, 46, 67, 46, 67, 46, 67, 46 }, { 31, -13, 31, -13, 31, -13, 31, -13 }, { -13, -67, -13, -67, -13, -67, -13, -67 }, { -54, -90, -54, -90, -54, -90, -54, -90 }, { -82, -73, -82, -73, -82, -73, -82, -73 }, { -90, -22, -90, -22, -90, -22, -90, -22 }, { -78, 38, -78, 38, -78, 38, -78, 38 }, { -46, 82, -46, 82, -46, 82, -46, 82 }, { -4, 88, -4, 88, -4, 88, -4, 88 }, { 38, 54, 38, 54, 38, 54, 38, 54 }, { 73, -4, 73, -4, 73, -4, 73, -4 }, { 90, -61, 90, -61, 90, -61, 90, -61 }, { 85, -90, 85, -90, 85, -90, 85, -90 }, { 61, -78, 61, -78, 61, -78, 61, -78 }, { 22, -31, 22, -31, 22, -31, 22, -31 }, },{/* 9-11 */ { 82, 78, 82, 78, 82, 78, 82, 78 }, { 22, -4, 22, -4, 22, -4, 22, -4 }, { -54, -82, -54, -82, -54, -82, -54, -82 }, { -90, -73, -90, -73, -90, -73, -90, -73 }, { -61, 13, -61, 13, -61, 13, -61, 13 }, { 13, 85, 13, 85, 13, 85, 13, 85 }, { 78, 67, 78, 67, 78, 67, 78, 67 }, { 85, -22, 85, 
-22, 85, -22, 85, -22 }, { 31, -88, 31, -88, 31, -88, 31, -88 }, { -46, -61, -46, -61, -46, -61, -46, -61 }, { -90, 31, -90, 31, -90, 31, -90, 31 }, { -67, 90, -67, 90, -67, 90, -67, 90 }, { 4, 54, 4, 54, 4, 54, 4, 54 }, { 73, -38, 73, -38, 73, -38, 73, -38 }, { 88, -90, 88, -90, 88, -90, 88, -90 }, { 38, -46, 38, -46, 38, -46, 38, -46 }, },{/* 13-15 */ { 73, 67, 73, 67, 73, 67, 73, 67 }, { -31, -54, -31, -54, -31, -54, -31, -54 }, { -90, -78, -90, -78, -90, -78, -90, -78 }, { -22, 38, -22, 38, -22, 38, -22, 38 }, { 78, 85, 78, 85, 78, 85, 78, 85 }, { 67, -22, 67, -22, 67, -22, 67, -22 }, { -38, -90, -38, -90, -38, -90, -38, -90 }, { -90, 4, -90, 4, -90, 4, -90, 4 }, { -13, 90, -13, 90, -13, 90, -13, 90 }, { 82, 13, 82, 13, 82, 13, 82, 13 }, { 61, -88, 61, -88, 61, -88, 61, -88 }, { -46, -31, -46, -31, -46, -31, -46, -31 }, { -88, 82, -88, 82, -88, 82, -88, 82 }, { -4, 46, -4, 46, -4, 46, -4, 46 }, { 85, -73, 85, -73, 85, -73, 85, -73 }, { 54, -61, 54, -61, 54, -61, 54, -61 }, },{/* 17-19 */ { 61, 54, 61, 54, 61, 54, 61, 54 }, { -73, -85, -73, -85, -73, -85, -73, -85 }, { -46, -4, -46, -4, -46, -4, -46, -4 }, { 82, 88, 82, 88, 82, 88, 82, 88 }, { 31, -46, 31, -46, 31, -46, 31, -46 }, { -88, -61, -88, -61, -88, -61, -88, -61 }, { -13, 82, -13, 82, -13, 82, -13, 82 }, { 90, 13, 90, 13, 90, 13, 90, 13 }, { -4, -90, -4, -90, -4, -90, -4, -90 }, { -90, 38, -90, 38, -90, 38, -90, 38 }, { 22, 67, 22, 67, 22, 67, 22, 67 }, { 85, -78, 85, -78, 85, -78, 85, -78 }, { -38, -22, -38, -22, -38, -22, -38, -22 }, { -78, 90, -78, 90, -78, 90, -78, 90 }, { 54, -31, 54, -31, 54, -31, 54, -31 }, { 67, -73, 67, -73, 67, -73, 67, -73 }, },{ /* 21-23 */ { 46, 38, 46, 38, 46, 38, 46, 38 }, { -90, -88, -90, -88, -90, -88, -90, -88 }, { 38, 73, 38, 73, 38, 73, 38, 73 }, { 54, -4, 54, -4, 54, -4, 54, -4 }, { -90, -67, -90, -67, -90, -67, -90, -67 }, { 31, 90, 31, 90, 31, 90, 31, 90 }, { 61, -46, 61, -46, 61, -46, 61, -46 }, { -88, -31, -88, -31, -88, -31, -88, -31 }, { 22, 85, 22, 85, 22, 
85, 22, 85 },
        { 67, -78, 67, -78, 67, -78, 67, -78 },
        { -85, 13, -85, 13, -85, 13, -85, 13 },
        { 13, 61, 13, 61, 13, 61, 13, 61 },
        { 73, -90, 73, -90, 73, -90, 73, -90 },
        { -82, 54, -82, 54, -82, 54, -82, 54 },
        { 4, 22, 4, 22, 4, 22, 4, 22 },
        { 78, -82, 78, -82, 78, -82, 78, -82 },
    },{ /* 25-27 */
        { 31, 22, 31, 22, 31, 22, 31, 22 },
        { -78, -61, -78, -61, -78, -61, -78, -61 },
        { 90, 85, 90, 85, 90, 85, 90, 85 },
        { -61, -90, -61, -90, -61, -90, -61, -90 },
        { 4, 73, 4, 73, 4, 73, 4, 73 },
        { 54, -38, 54, -38, 54, -38, 54, -38 },
        { -88, -4, -88, -4, -88, -4, -88, -4 },
        { 82, 46, 82, 46, 82, 46, 82, 46 },
        { -38, -78, -38, -78, -38, -78, -38, -78 },
        { -22, 90, -22, 90, -22, 90, -22, 90 },
        { 73, -82, 73, -82, 73, -82, 73, -82 },
        { -90, 54, -90, 54, -90, 54, -90, 54 },
        { 67, -13, 67, -13, 67, -13, 67, -13 },
        { -13, -31, -13, -31, -13, -31, -13, -31 },
        { -46, 67, -46, 67, -46, 67, -46, 67 },
        { 85, -88, 85, -88, 85, -88, 85, -88 },
    },{/* 29-31 */
        { 13, 4, 13, 4, 13, 4, 13, 4 },
        { -38, -13, -38, -13, -38, -13, -38, -13 },
        { 61, 22, 61, 22, 61, 22, 61, 22 },
        { -78, -31, -78, -31, -78, -31, -78, -31 },
        { 88, 38, 88, 38, 88, 38, 88, 38 },
        { -90, -46, -90, -46, -90, -46, -90, -46 },
        { 85, 54, 85, 54, 85, 54, 85, 54 },
        { -73, -61, -73, -61, -73, -61, -73, -61 },
        { 54, 67, 54, 67, 54, 67, 54, 67 },
        { -31, -73, -31, -73, -31, -73, -31, -73 },
        { 4, 78, 4, 78, 4, 78, 4, 78 },
        { 22, -82, 22, -82, 22, -82, 22, -82 },
        { -46, 85, -46, 85, -46, 85, -46, 85 },
        { 67, -88, 67, -88, 67, -88, 67, -88 },
        { -82, 90, -82, 90, -82, 90, -82, 90 },
        { 90, -90, 90, -90, 90, -90, 90, -90 },
    }
};

// Right shift applied after the first transform pass, and the matching
// rounding offset (half of 1 << shift_1st).
#define shift_1st 7
#define add_1st (1 << (shift_1st - 1))


/*
 * Residual add for one 4x4 block of 8-bit samples without any transform.
 *
 * Each of the 16 coefficients is rounded as (c + 16) >> 5 (saturating 16-bit
 * add, arithmetic shift), added to the corresponding pixel and written back
 * with unsigned 8-bit saturation.
 *
 *   _dst     first pixel of the block; four rows are touched
 *   coeffs   16 values, read with aligned 128-bit loads, so the pointer has
 *            to be 16-byte aligned
 *   _stride  distance in bytes between two rows of _dst
 */
void ff_hevc_transform_skip_8_sse(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride)
{
    uint8_t *dst = (uint8_t*)_dst;
    ptrdiff_t stride = _stride;
    int shift = 5;
    int offset = 16;
    __m128i r0,r1,r2,r3,r4,r5,r6,r9;

    r9= _mm_setzero_si128();
    //r8= _mm_set_epi32(0,0,0,-1);
    r2= _mm_set1_epi16(offset);

    // r0 = coefficients 0..7, r1 = coefficients 8..15
    r0= _mm_load_si128((__m128i*)(coeffs));
    r1= _mm_load_si128((__m128i*)(coeffs+8));

    // (c + offset) >> shift
    r0= _mm_adds_epi16(r0,r2);
    r1= _mm_adds_epi16(r1,r2);

    r0= _mm_srai_epi16(r0,shift);
    r1= _mm_srai_epi16(r1,shift);

    // 8 bytes are loaded from each of the four rows, but only the low 4 of
    // each survive the 64-bit unpacks below.
    // NOTE(review): this reads 4 bytes past the 4-pixel row - confirm that
    // the picture buffers are padded accordingly.
    r3= _mm_loadl_epi64((__m128i*)(dst));
    r4= _mm_loadl_epi64((__m128i*)(dst + stride));
    r5= _mm_loadl_epi64((__m128i*)(dst + 2*stride));
    r6= _mm_loadl_epi64((__m128i*)(dst + 3*stride));

    // widen the pixels to 16 bit, then r3 = rows 0|1 and r4 = rows 2|3
    r3= _mm_unpacklo_epi8(r3,r9);
    r4= _mm_unpacklo_epi8(r4,r9);
    r5= _mm_unpacklo_epi8(r5,r9);
    r6= _mm_unpacklo_epi8(r6,r9);
    r3= _mm_unpacklo_epi64(r3,r4);
    r4= _mm_unpacklo_epi64(r5,r6);

    r3= _mm_adds_epi16(r3,r0);
    r4= _mm_adds_epi16(r4,r1);

    // back to 8 bit with unsigned saturation: r3 = 16 result bytes, 4 per row
    r3= _mm_packus_epi16(r3,r4);
    //r8= _mm_set_epi32(0,0,0,-1);

    // store 4 bytes per row, shifting the next row into the low 32 bits
    //_mm_maskmoveu_si128(r3,r8,(char *) (dst));
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(r3);

    r3= _mm_srli_si128(r3,4);
    //_mm_maskmoveu_si128(r3,r8,(char *) (dst+stride));
    *((uint32_t*)(dst+stride)) = _mm_cvtsi128_si32(r3);

    r3= _mm_srli_si128(r3,4);
    //_mm_maskmoveu_si128(r3,r8,(char *) (dst+2*stride));
    *((uint32_t*)(dst+2*stride)) = _mm_cvtsi128_si32(r3);

    r3= _mm_srli_si128(r3,4);
    //_mm_maskmoveu_si128(r3,r8,(char *) (dst+3*stride));
    *((uint32_t*)(dst+3*stride)) = _mm_cvtsi128_si32(r3);
}



#if HAVE_SSE4_1
/*
 * 4x4 transform with the transform4x4_luma coefficients, result added to an
 * 8-bit destination block.
 *
 * Two multiply-accumulate passes are run over the 16 coefficients.  The
 * first rounds with +64 and >> shift_1st, the second with +add_2nd and
 * >> shift_2nd; both pack their 32-bit sums to 16 bit with signed
 * saturation.  The result is added to the four destination rows with
 * unsigned 8-bit saturation.
 *
 *   _dst     first pixel of the block; four rows are touched
 *   coeffs   16 values, read with aligned 128-bit loads, so the pointer has
 *            to be 16-byte aligned
 *   _stride  distance in bytes between two rows of _dst
 */
void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
                                           ptrdiff_t _stride) {

    uint8_t shift_2nd = 12; // 20 - Bit depth
    uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))

    uint8_t *dst = (uint8_t*) _dst;
    ptrdiff_t stride = _stride;
    const int16_t *src = coeffs;
    __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA,
            m128iD;
    // rounding offset of the first pass
    m128iAdd = _mm_set1_epi32(64);

    // S0 = coefficients 0..7, S8 = coefficients 8..15
    S0 = _mm_load_si128((__m128i *) (src));
    S8 = _mm_load_si128((__m128i *) (src + 8));

    // interleave so that every 32-bit lane holds the pair that
    // _mm_madd_epi16 multiplies and sums: AC pairs coefficient i with i+8
    // (i = 0..3), BD pairs coefficient i+4 with i+12
    m128iAC = _mm_unpacklo_epi16(S0, S8);
    m128iBD = _mm_unpackhi_epi16(S0, S8);

    // --- first pass: four output vectors, one per table row pair ---
    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_load_si128((__m128i *) (transform4x4_luma[0])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_load_si128((__m128i *) (transform4x4_luma[1])));
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S0 = _mm_add_epi32(S0, m128iAdd);
    S0 = _mm_srai_epi32(S0, shift_1st);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_load_si128((__m128i *) (transform4x4_luma[2])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_load_si128((__m128i *) (transform4x4_luma[3])));
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S8 = _mm_add_epi32(S8, m128iAdd);
    S8 = _mm_srai_epi32(S8, shift_1st);

    m128iA = _mm_packs_epi32(S0, S8);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_load_si128((__m128i *) (transform4x4_luma[4])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_load_si128((__m128i *) (transform4x4_luma[5])));
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S0 = _mm_add_epi32(S0, m128iAdd);
    S0 = _mm_srai_epi32(S0, shift_1st);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_load_si128((__m128i *) (transform4x4_luma[6])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_load_si128((__m128i *) (transform4x4_luma[7])));
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S8 = _mm_add_epi32(S8, m128iAdd);
    S8 = _mm_srai_epi32(S8, shift_1st);

    m128iD = _mm_packs_epi32(S0, S8);

    // re-interleave the intermediate values for the second pass
    // (presumably a 4x4 transpose - verify before relying on it)
    S0 = _mm_unpacklo_epi16(m128iA, m128iD);
    S8 = _mm_unpackhi_epi16(m128iA, m128iD);

    m128iA = _mm_unpacklo_epi16(S0, S8);
    m128iD = _mm_unpackhi_epi16(S0, S8);

    /*   ###################    */
    // --- second pass: same scheme with the second-stage rounding ---
    m128iAdd = _mm_set1_epi32(add_2nd);

    m128iAC = _mm_unpacklo_epi16(m128iA, m128iD);
    m128iBD = _mm_unpackhi_epi16(m128iA, m128iD);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_load_si128((__m128i *) (transform4x4_luma[0])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_load_si128((__m128i *) (transform4x4_luma[1])));
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S0 = _mm_add_epi32(S0, m128iAdd);
    S0 = _mm_srai_epi32(S0, shift_2nd);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_load_si128((__m128i *) (transform4x4_luma[2])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_load_si128((__m128i *) (transform4x4_luma[3])));
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S8 = _mm_add_epi32(S8, m128iAdd);
    S8 = _mm_srai_epi32(S8, shift_2nd);

    m128iA = _mm_packs_epi32(S0, S8);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_load_si128((__m128i *) (transform4x4_luma[4])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_load_si128((__m128i *) (transform4x4_luma[5])));
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S0 = _mm_add_epi32(S0, m128iAdd);
    S0 = _mm_srai_epi32(S0, shift_2nd);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_load_si128((__m128i *) (transform4x4_luma[6])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_load_si128((__m128i *) (transform4x4_luma[7])));
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S8 = _mm_add_epi32(S8, m128iAdd);
    S8 = _mm_srai_epi32(S8, shift_2nd);

    m128iD = _mm_packs_epi32(S0, S8);

//    _mm_storeu_si128((__m128i *) (src), m128iA);
//    _mm_storeu_si128((__m128i *) (src + 8), m128iD);

    // split the two result vectors into their 64-bit halves and interleave
    // them into the order in which the four rows are written below
    S0 = _mm_move_epi64(m128iA); //contains row 0
    S8 = _mm_move_epi64(m128iD); //row 2
    m128iA = _mm_srli_si128(m128iA, 8); // row 1
    m128iD = _mm_srli_si128(m128iD, 8); // row 3
    m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA);
    m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD);
    S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2);
    S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2);

    //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1); //mask to store 4 * 8bit data

    // --- add to the destination, one row at a time ---
    // Each row: load 8 bytes (only the low 4 are written back), widen to
    // 16 bit, saturating add, pack to 8 bit with unsigned saturation, store
    // 4 bytes.
    m128iA = _mm_loadl_epi64((__m128i *) dst);
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
    m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);

    dst += stride;

    m128iA = _mm_loadl_epi64((__m128i *) dst);
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
    m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA);
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);

    dst += stride;

    m128iA = _mm_loadl_epi64((__m128i *) dst);
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
    m128iTmp1 = _mm_adds_epi16(S8, m128iA);
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);

    dst += stride;

    m128iA = _mm_loadl_epi64((__m128i *) dst);
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
    m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA);
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
}
#endif // SSE4.1

// The 10-bit variant below is compiled out.
#if 0
void ff_hevc_transform_4x4_luma_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
        ptrdiff_t _stride) {
    int i,j;
    uint8_t shift_2nd = 10; // 20 - Bit depth
    uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))

    uint16_t *dst = (uint16_t*) _dst;
    ptrdiff_t stride = _stride/(sizeof(uint16_t));
    int16_t *src = coeffs;
    __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA,
            m128iD;

    m128iAdd = _mm_set1_epi32(64);

    S0 = _mm_loadu_si128((__m128i *) (src));
    S8 = _mm_loadu_si128((__m128i *) (src + 8));

    m128iAC = _mm_unpacklo_epi16(S0, S8);
    m128iBD = _mm_unpackhi_epi16(S0, S8);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[0])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[1])));
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S0 = _mm_add_epi32(S0, m128iAdd);
    S0 = _mm_srai_epi32(S0, shift_1st);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[2])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[3])));
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S8 = _mm_add_epi32(S8, m128iAdd);
    S8 = _mm_srai_epi32(S8, shift_1st);

    m128iA = _mm_packs_epi32(S0, S8);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[4])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[5])));
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S0 = _mm_add_epi32(S0, m128iAdd);
    S0 = _mm_srai_epi32(S0, shift_1st);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[6])));
    m128iTmp2 =
_mm_madd_epi16(m128iBD, _mm_loadu_si128((__m128i *) (transform4x4_luma[7]))); S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); S8 = _mm_add_epi32(S8, m128iAdd); S8 = _mm_srai_epi32(S8, shift_1st); m128iD = _mm_packs_epi32(S0, S8); S0 = _mm_unpacklo_epi16(m128iA, m128iD); S8 = _mm_unpackhi_epi16(m128iA, m128iD); m128iA = _mm_unpacklo_epi16(S0, S8); m128iD = _mm_unpackhi_epi16(S0, S8); /* ################### */ m128iAdd = _mm_set1_epi32(add_2nd); m128iAC = _mm_unpacklo_epi16(m128iA, m128iD); m128iBD = _mm_unpackhi_epi16(m128iA, m128iD); m128iTmp1 = _mm_madd_epi16(m128iAC, _mm_load_si128((__m128i *) (transform4x4_luma[0]))); m128iTmp2 = _mm_madd_epi16(m128iBD, _mm_load_si128((__m128i *) (transform4x4_luma[1]))); S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); S0 = _mm_add_epi32(S0, m128iAdd); S0 = _mm_srai_epi32(S0, shift_2nd); m128iTmp1 = _mm_madd_epi16(m128iAC, _mm_load_si128((__m128i *) (transform4x4_luma[2]))); m128iTmp2 = _mm_madd_epi16(m128iBD, _mm_load_si128((__m128i *) (transform4x4_luma[3]))); S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); S8 = _mm_add_epi32(S8, m128iAdd); S8 = _mm_srai_epi32(S8, shift_2nd); m128iA = _mm_packs_epi32(S0, S8); m128iTmp1 = _mm_madd_epi16(m128iAC, _mm_load_si128((__m128i *) (transform4x4_luma[4]))); m128iTmp2 = _mm_madd_epi16(m128iBD, _mm_load_si128((__m128i *) (transform4x4_luma[5]))); S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); S0 = _mm_add_epi32(S0, m128iAdd); S0 = _mm_srai_epi32(S0, shift_2nd); m128iTmp1 = _mm_madd_epi16(m128iAC, _mm_load_si128((__m128i *) (transform4x4_luma[6]))); m128iTmp2 = _mm_madd_epi16(m128iBD, _mm_load_si128((__m128i *) (transform4x4_luma[7]))); S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); S8 = _mm_add_epi32(S8, m128iAdd); S8 = _mm_srai_epi32(S8, shift_2nd); m128iD = _mm_packs_epi32(S0, S8); _mm_storeu_si128((__m128i *) (src), m128iA); _mm_storeu_si128((__m128i *) (src + 8), m128iD); j = 0; for (i = 0; i < 2; i++) { dst[0] = av_clip_uintp2(dst[0] + src[j],10); dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10); dst[2] = 
av_clip_uintp2(dst[2] + src[j + 8],10);
        dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
        j += 1;
        dst += stride;
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
        dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
        dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
        dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
        j += 1;
        dst += stride;
    }

}
#endif


#if HAVE_SSE4_1
/*
 * 4x4 transform with the transform4x4 coefficients, result added to an 8-bit
 * destination block.
 *
 * Each of the two passes forms two sums E1/E2 from one interleaved half of
 * the input and two sums O1/O2 from the other half, and combines them as
 * E1+O1, E2+O2, E2-O2, E1-O1.  The rounding offset (add_1st, then add_2nd)
 * is folded into E1/E2 before the combination; the results are shifted by
 * shift_1st / shift_2nd and packed to 16 bit with signed saturation.  The
 * final values are added to the four destination rows with unsigned 8-bit
 * saturation.
 *
 *   _dst     first pixel of the block; four rows are touched
 *   coeffs   16 values, read with aligned 128-bit loads, so the pointer has
 *            to be 16-byte aligned
 *   _stride  distance in bytes between two rows of _dst
 */
void ff_hevc_transform_4x4_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
        ptrdiff_t _stride) {
    uint8_t shift_2nd = 12; // 20 - Bit depth
    uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))

    uint8_t *dst = (uint8_t*) _dst;
    ptrdiff_t stride = _stride;
    const int16_t *src = coeffs;

    __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD, m128iTmp1,m128iTmp2;
    // S0 = coefficients 0..7, S8 = coefficients 8..15
    S0 = _mm_load_si128((__m128i *) (src));
    S8 = _mm_load_si128((__m128i *) (src + 8));
    m128iAdd = _mm_set1_epi32(add_1st);

    // --- first pass ---
    // low halves interleaved: coefficient i paired with i+8 (i = 0..3)
    m128Tmp = _mm_unpacklo_epi16(S0, S8);
    E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
    E1 = _mm_add_epi32(E1, m128iAdd);

    E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
    E2 = _mm_add_epi32(E2, m128iAdd);

    // high halves interleaved: coefficient i+4 paired with i+12
    m128Tmp = _mm_unpackhi_epi16(S0, S8);
    O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
    O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));

    m128iA = _mm_add_epi32(E1, O1);
    m128iA = _mm_srai_epi32(m128iA, shift_1st); // Sum = Sum >> iShiftNum
    m128Tmp = _mm_add_epi32(E2, O2);
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum
    m128iA = _mm_packs_epi32(m128iA, m128Tmp);

    m128iD = _mm_sub_epi32(E2, O2);
    m128iD = _mm_srai_epi32(m128iD, shift_1st); // Sum = Sum >> iShiftNum

    m128Tmp = _mm_sub_epi32(E1, O1);
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum

    m128iD = _mm_packs_epi32(m128iD, m128Tmp);

    // re-interleave the intermediate values for the second pass
    // (presumably a 4x4 transpose - verify before relying on it)
    S0 = _mm_unpacklo_epi16(m128iA, m128iD);
    S8 = _mm_unpackhi_epi16(m128iA, m128iD);

    m128iA = _mm_unpacklo_epi16(S0, S8);
    m128iD = _mm_unpackhi_epi16(S0, S8);

    /*  ##########################  */

    // --- second pass: same scheme with the second-stage rounding ---
    m128iAdd = _mm_set1_epi32(add_2nd);
    m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD);
    E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
    E1 = _mm_add_epi32(E1, m128iAdd);

    E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
    E2 = _mm_add_epi32(E2, m128iAdd);

    m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD);
    O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
    O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));

    m128iA = _mm_add_epi32(E1, O1);
    m128iA = _mm_srai_epi32(m128iA, shift_2nd);
    m128Tmp = _mm_add_epi32(E2, O2);
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
    m128iA = _mm_packs_epi32(m128iA, m128Tmp);

    m128iD = _mm_sub_epi32(E2, O2);
    m128iD = _mm_srai_epi32(m128iD, shift_2nd);

    m128Tmp = _mm_sub_epi32(E1, O1);
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);

    m128iD = _mm_packs_epi32(m128iD, m128Tmp);

    // split the two result vectors into their 64-bit halves and interleave
    // them into the order in which the four rows are written below
    S0 = _mm_move_epi64(m128iA); //contains row 0
    S8 = _mm_move_epi64(m128iD); //row 2
    m128iA = _mm_srli_si128(m128iA, 8); // row 1
    m128iD = _mm_srli_si128(m128iD, 8); // row 3
    m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA);
    m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD);
    S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2);
    S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2);

    //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1); //mask to store 4 * 8bit data

    // --- add to the destination, one row at a time ---
    // Each row: load 8 bytes (only the low 4 are written back), widen to
    // 16 bit, saturating add, pack to 8 bit with unsigned saturation, store
    // 4 bytes.
    m128iA = _mm_loadl_epi64((__m128i *) dst);
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
    m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);

    dst += stride;

    m128iA = _mm_loadl_epi64((__m128i *) dst);
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
    m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA);
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);

    dst += stride;

    m128iA = _mm_loadl_epi64((__m128i *) dst);
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
    m128iTmp1 = _mm_adds_epi16(S8, m128iA);
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);

    dst += stride;

    m128iA = _mm_loadl_epi64((__m128i *) dst);
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
    m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA);
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
}
#endif

// The 10-bit variant below is compiled out.
#if 0
void ff_hevc_transform_4x4_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
        ptrdiff_t _stride) {
    int i;
    uint8_t shift_2nd = 10; // 20 - Bit depth
    uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))

    uint16_t *dst = (uint16_t*) _dst;
    ptrdiff_t stride = _stride/2;
    int16_t *src = coeffs;

    int j;
    __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD;
    S0 = _mm_load_si128((__m128i *) (src));
    S8 = _mm_load_si128((__m128i *) (src + 8));
    m128iAdd = _mm_set1_epi32(add_1st);

    m128Tmp = _mm_unpacklo_epi16(S0, S8);
    E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
    E1 = _mm_add_epi32(E1, m128iAdd);

    E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
    E2 = _mm_add_epi32(E2, m128iAdd);

    m128Tmp = _mm_unpackhi_epi16(S0, S8);
    O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
    O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));

    m128iA = _mm_add_epi32(E1, O1);
    m128iA = _mm_srai_epi32(m128iA, shift_1st); // Sum = Sum >> iShiftNum
    m128Tmp = _mm_add_epi32(E2, O2);
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum
    m128iA = _mm_packs_epi32(m128iA, m128Tmp);

    m128iD = _mm_sub_epi32(E2, O2);
    m128iD = _mm_srai_epi32(m128iD, shift_1st); // Sum = Sum >> iShiftNum

    m128Tmp = _mm_sub_epi32(E1,
O1); m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum m128iD = _mm_packs_epi32(m128iD, m128Tmp); S0 = _mm_unpacklo_epi16(m128iA, m128iD); S8 = _mm_unpackhi_epi16(m128iA, m128iD); m128iA = _mm_unpacklo_epi16(S0, S8); m128iD = _mm_unpackhi_epi16(S0, S8); /* ########################## */ m128iAdd = _mm_set1_epi32(add_2nd); m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD); E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0]))); E1 = _mm_add_epi32(E1, m128iAdd); E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1]))); E2 = _mm_add_epi32(E2, m128iAdd); m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD); O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2]))); O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3]))); m128iA = _mm_add_epi32(E1, O1); m128iA = _mm_srai_epi32(m128iA, shift_2nd); m128Tmp = _mm_add_epi32(E2, O2); m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd); m128iA = _mm_packs_epi32(m128iA, m128Tmp); m128iD = _mm_sub_epi32(E2, O2); m128iD = _mm_srai_epi32(m128iD, shift_2nd); m128Tmp = _mm_sub_epi32(E1, O1); m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd); m128iD = _mm_packs_epi32(m128iD, m128Tmp); _mm_storeu_si128((__m128i *) (src), m128iA); _mm_storeu_si128((__m128i *) (src + 8), m128iD); j = 0; for (i = 0; i < 2; i++) { dst[0] = av_clip_uintp2(dst[0] + src[j],10); dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10); dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10); dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10); j += 1; dst += stride; dst[0] = av_clip_uintp2(dst[0] + src[j],10); dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10); dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10); dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10); j += 1; dst += stride; } } #endif #if HAVE_SSE4_1 void ff_hevc_transform_8x8_add_8_sse4(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride) { uint8_t shift_2nd = 12; // 20 - Bit depth uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1)) uint8_t *dst 
= (uint8_t*) _dst; ptrdiff_t stride = _stride / sizeof(uint8_t); const int16_t *src = coeffs; __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h, T0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11; T0= _mm_load_si128((__m128i *) (transform8x8[0])); T1= _mm_load_si128((__m128i *) (transform8x8[1])); T2= _mm_load_si128((__m128i *) (transform8x8[2])); T3= _mm_load_si128((__m128i *) (transform8x8[3])); T4= _mm_load_si128((__m128i *) (transform8x8[4])); T5= _mm_load_si128((__m128i *) (transform8x8[5])); T6= _mm_load_si128((__m128i *) (transform8x8[6])); T7= _mm_load_si128((__m128i *) (transform8x8[7])); T8= _mm_load_si128((__m128i *) (transform8x8[8])); T9= _mm_load_si128((__m128i *) (transform8x8[9])); T10= _mm_load_si128((__m128i *) (transform8x8[10])); T11= _mm_load_si128((__m128i *) (transform8x8[11])); m128iAdd = _mm_set1_epi32(add_1st); m128iS1 = _mm_load_si128((__m128i *) (src + 8)); m128iS3 = _mm_load_si128((__m128i *) (src + 24)); m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); E1l = _mm_madd_epi16(m128Tmp0, T0); m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); E1h = _mm_madd_epi16(m128Tmp1, T0); m128iS5 = _mm_load_si128((__m128i *) (src + 40)); m128iS7 = _mm_load_si128((__m128i *) (src + 56)); m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); E2l = _mm_madd_epi16(m128Tmp2, T1); m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); E2h = _mm_madd_epi16(m128Tmp3, T1); O0l = _mm_add_epi32(E1l, E2l); O0h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, T2); E1h = _mm_madd_epi16(m128Tmp1, T2); E2l = _mm_madd_epi16(m128Tmp2, T3); E2h = _mm_madd_epi16(m128Tmp3, T3); O1l = _mm_add_epi32(E1l, E2l); O1h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, T4); E1h = _mm_madd_epi16(m128Tmp1, T4); E2l = _mm_madd_epi16(m128Tmp2, T5); E2h = _mm_madd_epi16(m128Tmp3, T5); O2l = 
_mm_add_epi32(E1l, E2l); O2h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, T6); E1h = _mm_madd_epi16(m128Tmp1, T6); E2l = _mm_madd_epi16(m128Tmp2, T7); E2h = _mm_madd_epi16(m128Tmp3, T7); O3h = _mm_add_epi32(E1h, E2h); O3l = _mm_add_epi32(E1l, E2l); /* ------- */ m128iS0 = _mm_load_si128((__m128i *) (src + 0)); m128iS4 = _mm_load_si128((__m128i *) (src + 32)); m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4); EE0l = _mm_madd_epi16(m128Tmp0, T8); m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4); EE0h = _mm_madd_epi16(m128Tmp1, T8); EE1l = _mm_madd_epi16(m128Tmp0, T9); EE1h = _mm_madd_epi16(m128Tmp1, T9); /* ------- */ m128iS2 = _mm_load_si128((__m128i *) (src + 16)); m128iS6 = _mm_load_si128((__m128i *) (src + 48)); m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); E00l = _mm_madd_epi16(m128Tmp0, T10); m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); E00h = _mm_madd_epi16(m128Tmp1, T10); E01l = _mm_madd_epi16(m128Tmp0, T11); E01h = _mm_madd_epi16(m128Tmp1, T11); E0l = _mm_add_epi32(EE0l, E00l); E0l = _mm_add_epi32(E0l, m128iAdd); E0h = _mm_add_epi32(EE0h, E00h); E0h = _mm_add_epi32(E0h, m128iAdd); E3l = _mm_sub_epi32(EE0l, E00l); E3l = _mm_add_epi32(E3l, m128iAdd); E3h = _mm_sub_epi32(EE0h, E00h); E3h = _mm_add_epi32(E3h, m128iAdd); E1l = _mm_add_epi32(EE1l, E01l); E1l = _mm_add_epi32(E1l, m128iAdd); E1h = _mm_add_epi32(EE1h, E01h); E1h = _mm_add_epi32(E1h, m128iAdd); E2l = _mm_sub_epi32(EE1l, E01l); E2l = _mm_add_epi32(E2l, m128iAdd); E2h = _mm_sub_epi32(EE1h, E01h); E2h = _mm_add_epi32(E2h, m128iAdd); m128iS0 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st)); m128iS1 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st)); m128iS2 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st)); m128iS3 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E3l, O3l), 
shift_1st), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st)); m128iS4 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st)); m128iS5 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st)); m128iS6 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st)); m128iS7 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st)); /* Invers matrix */ E0l = _mm_unpacklo_epi16(m128iS0, m128iS4); E1l = _mm_unpacklo_epi16(m128iS1, m128iS5); E2l = _mm_unpacklo_epi16(m128iS2, m128iS6); E3l = _mm_unpacklo_epi16(m128iS3, m128iS7); O0l = _mm_unpackhi_epi16(m128iS0, m128iS4); O1l = _mm_unpackhi_epi16(m128iS1, m128iS5); O2l = _mm_unpackhi_epi16(m128iS2, m128iS6); O3l = _mm_unpackhi_epi16(m128iS3, m128iS7); m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l); m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l); m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l); m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l); m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l); m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l); m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l); m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l); m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); m128iAdd = _mm_set1_epi32(add_2nd); m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); E1l = _mm_madd_epi16(m128Tmp0, T0); m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); E1h = _mm_madd_epi16(m128Tmp1, T0); m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); E2l = _mm_madd_epi16(m128Tmp2, T1); m128Tmp3 = 
_mm_unpackhi_epi16(m128iS5, m128iS7); E2h = _mm_madd_epi16(m128Tmp3, T1); O0l = _mm_add_epi32(E1l, E2l); O0h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, T2); E1h = _mm_madd_epi16(m128Tmp1, T2); E2l = _mm_madd_epi16(m128Tmp2, T3); E2h = _mm_madd_epi16(m128Tmp3, T3); O1l = _mm_add_epi32(E1l, E2l); O1h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, T4); E1h = _mm_madd_epi16(m128Tmp1, T4); E2l = _mm_madd_epi16(m128Tmp2, T5); E2h = _mm_madd_epi16(m128Tmp3, T5); O2l = _mm_add_epi32(E1l, E2l); O2h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, T6); E1h = _mm_madd_epi16(m128Tmp1, T6); E2l = _mm_madd_epi16(m128Tmp2, T7); E2h = _mm_madd_epi16(m128Tmp3, T7); O3h = _mm_add_epi32(E1h, E2h); O3l = _mm_add_epi32(E1l, E2l); m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4); EE0l = _mm_madd_epi16(m128Tmp0, T8); m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4); EE0h = _mm_madd_epi16(m128Tmp1, T8); EE1l = _mm_madd_epi16(m128Tmp0, T9); EE1h = _mm_madd_epi16(m128Tmp1, T9); m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); E00l = _mm_madd_epi16(m128Tmp0, T10); m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); E00h = _mm_madd_epi16(m128Tmp1, T10); E01l = _mm_madd_epi16(m128Tmp0, T11); E01h = _mm_madd_epi16(m128Tmp1, T11); E0l = _mm_add_epi32(EE0l, E00l); E0l = _mm_add_epi32(E0l, m128iAdd); E0h = _mm_add_epi32(EE0h, E00h); E0h = _mm_add_epi32(E0h, m128iAdd); E3l = _mm_sub_epi32(EE0l, E00l); E3l = _mm_add_epi32(E3l, m128iAdd); E3h = _mm_sub_epi32(EE0h, E00h); E3h = _mm_add_epi32(E3h, m128iAdd); E1l = _mm_add_epi32(EE1l, E01l); E1l = _mm_add_epi32(E1l, m128iAdd); E1h = _mm_add_epi32(EE1h, E01h); E1h = _mm_add_epi32(E1h, m128iAdd); E2l = _mm_sub_epi32(EE1l, E01l); E2l = _mm_add_epi32(E2l, m128iAdd); E2h = _mm_sub_epi32(EE1h, E01h); E2h = _mm_add_epi32(E2h, m128iAdd); m128iS0 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd)); m128iS1 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E1l, 
O1l), shift_2nd), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd)); m128iS2 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd)); m128iS3 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd)); m128iS4 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd)); m128iS5 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd)); m128iS6 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd)); m128iS7 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd)); E0l = _mm_unpacklo_epi16(m128iS0, m128iS4); E1l = _mm_unpacklo_epi16(m128iS1, m128iS5); E2l = _mm_unpacklo_epi16(m128iS2, m128iS6); E3l = _mm_unpacklo_epi16(m128iS3, m128iS7); O0l = _mm_unpackhi_epi16(m128iS0, m128iS4); O1l = _mm_unpackhi_epi16(m128iS1, m128iS5); O2l = _mm_unpackhi_epi16(m128iS2, m128iS6); O3l = _mm_unpackhi_epi16(m128iS3, m128iS7); m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l); m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l); m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l); m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l); m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l); m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l); m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l); m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l); m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); E0l = _mm_loadl_epi64((__m128i *) dst); E0l = 
_mm_unpacklo_epi8(E0l, _mm_setzero_si128()); E0l = _mm_adds_epi16(E0l, m128iS0); E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); _mm_storel_epi64((__m128i *) dst, E0l); dst += stride; E0l = _mm_loadl_epi64((__m128i *) dst); E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); E0l = _mm_adds_epi16(E0l, m128iS1); E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); _mm_storel_epi64((__m128i *) dst, E0l); dst += stride; E0l = _mm_loadl_epi64((__m128i *) dst); E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); E0l = _mm_adds_epi16(E0l, m128iS2); E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); _mm_storel_epi64((__m128i *) dst, E0l); dst += stride; E0l = _mm_loadl_epi64((__m128i *) dst); E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); E0l = _mm_adds_epi16(E0l, m128iS3); E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); _mm_storel_epi64((__m128i *) dst, E0l); dst += stride; E0l = _mm_loadl_epi64((__m128i *) dst); E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); E0l = _mm_adds_epi16(E0l, m128iS4); E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); _mm_storel_epi64((__m128i *) dst, E0l); dst += stride; E0l = _mm_loadl_epi64((__m128i *) dst); E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); E0l = _mm_adds_epi16(E0l, m128iS5); E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); _mm_storel_epi64((__m128i *) dst, E0l); dst += stride; E0l = _mm_loadl_epi64((__m128i *) dst); E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); E0l = _mm_adds_epi16(E0l, m128iS6); E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); _mm_storel_epi64((__m128i *) dst, E0l); dst += stride; E0l = _mm_loadl_epi64((__m128i *) dst); E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); E0l = _mm_adds_epi16(E0l, m128iS7); E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); _mm_storel_epi64((__m128i *) dst, E0l); dst += stride; } #endif #if 0 void ff_hevc_transform_8x8_add_10_sse4(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride) { int i; uint16_t *dst = (uint16_t*) _dst; ptrdiff_t stride = _stride / 
sizeof(uint16_t); int16_t *src = coeffs; uint8_t shift_2nd = 10; // 20 - Bit depth uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1)) __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h; int j; m128iAdd = _mm_set1_epi32(add_1st); m128iS1 = _mm_load_si128((__m128i *) (src + 8)); m128iS3 = _mm_load_si128((__m128i *) (src + 24)); m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform8x8[0]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform8x8[0]))); m128iS5 = _mm_load_si128((__m128i *) (src + 40)); m128iS7 = _mm_load_si128((__m128i *) (src + 56)); m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform8x8[1]))); m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform8x8[1]))); O0l = _mm_add_epi32(E1l, E2l); O0h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform8x8[2]))); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform8x8[2]))); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform8x8[3]))); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform8x8[3]))); O1l = _mm_add_epi32(E1l, E2l); O1h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform8x8[4]))); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform8x8[4]))); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform8x8[5]))); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform8x8[5]))); O2l = _mm_add_epi32(E1l, E2l); O2h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, 
_mm_load_si128((__m128i *) (transform8x8[6]))); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform8x8[6]))); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform8x8[7]))); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform8x8[7]))); O3h = _mm_add_epi32(E1h, E2h); O3l = _mm_add_epi32(E1l, E2l); /* ------- */ m128iS0 = _mm_load_si128((__m128i *) (src + 0)); m128iS4 = _mm_load_si128((__m128i *) (src + 32)); m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4); EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform8x8[8]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4); EE0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform8x8[8]))); EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform8x8[9]))); EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform8x8[9]))); /* ------- */ m128iS2 = _mm_load_si128((__m128i *) (src + 16)); m128iS6 = _mm_load_si128((__m128i *) (src + 48)); m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform8x8[10]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform8x8[10]))); E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform8x8[11]))); E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform8x8[11]))); E0l = _mm_add_epi32(EE0l, E00l); E0l = _mm_add_epi32(E0l, m128iAdd); E0h = _mm_add_epi32(EE0h, E00h); E0h = _mm_add_epi32(E0h, m128iAdd); E3l = _mm_sub_epi32(EE0l, E00l); E3l = _mm_add_epi32(E3l, m128iAdd); E3h = _mm_sub_epi32(EE0h, E00h); E3h = _mm_add_epi32(E3h, m128iAdd); E1l = _mm_add_epi32(EE1l, E01l); E1l = _mm_add_epi32(E1l, m128iAdd); E1h = _mm_add_epi32(EE1h, E01h); E1h = _mm_add_epi32(E1h, m128iAdd); E2l = _mm_sub_epi32(EE1l, E01l); E2l = _mm_add_epi32(E2l, m128iAdd); E2h = _mm_sub_epi32(EE1h, E01h); E2h = _mm_add_epi32(E2h, m128iAdd); m128iS0 = _mm_packs_epi32( 
_mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st)); m128iS1 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st)); m128iS2 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st)); m128iS3 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st)); m128iS4 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st)); m128iS5 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st)); m128iS6 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st)); m128iS7 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st)); /* Invers matrix */ E0l = _mm_unpacklo_epi16(m128iS0, m128iS4); E1l = _mm_unpacklo_epi16(m128iS1, m128iS5); E2l = _mm_unpacklo_epi16(m128iS2, m128iS6); E3l = _mm_unpacklo_epi16(m128iS3, m128iS7); O0l = _mm_unpackhi_epi16(m128iS0, m128iS4); O1l = _mm_unpackhi_epi16(m128iS1, m128iS5); O2l = _mm_unpackhi_epi16(m128iS2, m128iS6); O3l = _mm_unpackhi_epi16(m128iS3, m128iS7); m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l); m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l); m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l); m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l); m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l); m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l); m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l); m128Tmp3 = 
_mm_unpackhi_epi16(O1l, O3l); m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); m128iAdd = _mm_set1_epi32(add_2nd); m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform8x8[0]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform8x8[0]))); m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform8x8[1]))); m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform8x8[1]))); O0l = _mm_add_epi32(E1l, E2l); O0h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform8x8[2]))); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform8x8[2]))); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform8x8[3]))); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform8x8[3]))); O1l = _mm_add_epi32(E1l, E2l); O1h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform8x8[4]))); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform8x8[4]))); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform8x8[5]))); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform8x8[5]))); O2l = _mm_add_epi32(E1l, E2l); O2h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform8x8[6]))); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform8x8[6]))); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform8x8[7]))); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform8x8[7]))); O3h = _mm_add_epi32(E1h, E2h); O3l = _mm_add_epi32(E1l, E2l); m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4); EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) 
(transform8x8[8]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4); EE0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform8x8[8]))); EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform8x8[9]))); EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform8x8[9]))); m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform8x8[10]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform8x8[10]))); E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform8x8[11]))); E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform8x8[11]))); E0l = _mm_add_epi32(EE0l, E00l); E0l = _mm_add_epi32(E0l, m128iAdd); E0h = _mm_add_epi32(EE0h, E00h); E0h = _mm_add_epi32(E0h, m128iAdd); E3l = _mm_sub_epi32(EE0l, E00l); E3l = _mm_add_epi32(E3l, m128iAdd); E3h = _mm_sub_epi32(EE0h, E00h); E3h = _mm_add_epi32(E3h, m128iAdd); E1l = _mm_add_epi32(EE1l, E01l); E1l = _mm_add_epi32(E1l, m128iAdd); E1h = _mm_add_epi32(EE1h, E01h); E1h = _mm_add_epi32(E1h, m128iAdd); E2l = _mm_sub_epi32(EE1l, E01l); E2l = _mm_add_epi32(E2l, m128iAdd); E2h = _mm_sub_epi32(EE1h, E01h); E2h = _mm_add_epi32(E2h, m128iAdd); m128iS0 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd)); m128iS1 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd)); m128iS2 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd)); m128iS3 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd)); m128iS4 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd)); m128iS5 = _mm_packs_epi32( 
_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd)); m128iS6 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd)); m128iS7 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd)); _mm_store_si128((__m128i *) (src), m128iS0); _mm_store_si128((__m128i *) (src + 8), m128iS1); _mm_store_si128((__m128i *) (src + 16), m128iS2); _mm_store_si128((__m128i *) (src + 24), m128iS3); _mm_store_si128((__m128i *) (src + 32), m128iS4); _mm_store_si128((__m128i *) (src + 40), m128iS5); _mm_store_si128((__m128i *) (src + 48), m128iS6); _mm_store_si128((__m128i *) (src + 56), m128iS7); j = 0; for (i = 0; i < 4; i++) { dst[0] = av_clip_uintp2(dst[0] + src[j],10); dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10); dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10); dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10); dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10); dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10); dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10); dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10); j += 1; dst += stride; dst[0] = av_clip_uintp2(dst[0] + src[j],10); dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10); dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10); dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10); dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10); dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10); dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10); dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10); j += 1; dst += stride; } } #endif #if HAVE_SSE4_1 void ff_hevc_transform_16x16_add_8_sse4(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride) { uint8_t shift_2nd = 12; // 20 - Bit depth uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1)) int i; uint8_t *dst = (uint8_t*) _dst; ptrdiff_t stride = _stride / sizeof(uint8_t); const int16_t *src = coeffs; int32_t shift; __m128i m128iS0, 
m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13, m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h, O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l, E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h; __m128i E4l, E5l, E6l, E7l; __m128i E4h, E5h, E6h, E7h; __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15; __m128i r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31; /*__m128i T00,T01, T02, T03, T04, T05, T06, T07; __m128i T10,T11, T12, T13, T14, T15, T16, T17; __m128i T20,T21, T22, T23, T24, T25, T26, T27; __m128i T30,T31, T32, T33, T34, T35, T36, T37; __m128i U00,U01, U02, U03, U10, U11, U12, U13; __m128i V00,V01, V10, V11;*/ const __m128i T00 = _mm_load_si128((__m128i *) (transform16x16_1[0][0])); const __m128i T01 = _mm_load_si128((__m128i *) (transform16x16_1[0][1])); const __m128i T02 = _mm_load_si128((__m128i *) (transform16x16_1[0][2])); const __m128i T03 = _mm_load_si128((__m128i *) (transform16x16_1[0][3])); const __m128i T04 = _mm_load_si128((__m128i *) (transform16x16_1[0][4])); const __m128i T05 = _mm_load_si128((__m128i *) (transform16x16_1[0][5])); const __m128i T06 = _mm_load_si128((__m128i *) (transform16x16_1[0][6])); const __m128i T07 = _mm_load_si128((__m128i *) (transform16x16_1[0][7])); const __m128i T10 = _mm_load_si128((__m128i *) (transform16x16_1[1][0])); const __m128i T11 = _mm_load_si128((__m128i *) (transform16x16_1[1][1])); const __m128i T12 = _mm_load_si128((__m128i *) (transform16x16_1[1][2])); const __m128i T13 = _mm_load_si128((__m128i *) (transform16x16_1[1][3])); const __m128i T14 = _mm_load_si128((__m128i *) (transform16x16_1[1][4])); const __m128i T15 = _mm_load_si128((__m128i *) (transform16x16_1[1][5])); const __m128i T16 = _mm_load_si128((__m128i *) (transform16x16_1[1][6])); const 
__m128i T17 = _mm_load_si128((__m128i *) (transform16x16_1[1][7])); const __m128i T20 = _mm_load_si128((__m128i *) (transform16x16_1[2][0])); const __m128i T21 = _mm_load_si128((__m128i *) (transform16x16_1[2][1])); const __m128i T22 = _mm_load_si128((__m128i *) (transform16x16_1[2][2])); const __m128i T23 = _mm_load_si128((__m128i *) (transform16x16_1[2][3])); const __m128i T24 = _mm_load_si128((__m128i *) (transform16x16_1[2][4])); const __m128i T25 = _mm_load_si128((__m128i *) (transform16x16_1[2][5])); const __m128i T26 = _mm_load_si128((__m128i *) (transform16x16_1[2][6])); const __m128i T27 = _mm_load_si128((__m128i *) (transform16x16_1[2][7])); const __m128i T30 = _mm_load_si128((__m128i *) (transform16x16_1[3][0])); const __m128i T31 = _mm_load_si128((__m128i *) (transform16x16_1[3][1])); const __m128i T32 = _mm_load_si128((__m128i *) (transform16x16_1[3][2])); const __m128i T33 = _mm_load_si128((__m128i *) (transform16x16_1[3][3])); const __m128i T34 = _mm_load_si128((__m128i *) (transform16x16_1[3][4])); const __m128i T35 = _mm_load_si128((__m128i *) (transform16x16_1[3][5])); const __m128i T36 = _mm_load_si128((__m128i *) (transform16x16_1[3][6])); const __m128i T37 = _mm_load_si128((__m128i *) (transform16x16_1[3][7])); const __m128i U00 = _mm_load_si128((__m128i *) (transform16x16_2[0][0])); const __m128i U01 = _mm_load_si128((__m128i *) (transform16x16_2[0][1])); const __m128i U02 = _mm_load_si128((__m128i *) (transform16x16_2[0][2])); const __m128i U03 = _mm_load_si128((__m128i *) (transform16x16_2[0][3])); const __m128i U10 = _mm_load_si128((__m128i *) (transform16x16_2[1][0])); const __m128i U11 = _mm_load_si128((__m128i *) (transform16x16_2[1][1])); const __m128i U12 = _mm_load_si128((__m128i *) (transform16x16_2[1][2])); const __m128i U13 = _mm_load_si128((__m128i *) (transform16x16_2[1][3])); const __m128i V00 = _mm_load_si128((__m128i *) (transform16x16_3[0][0])); const __m128i V01 = _mm_load_si128((__m128i *) (transform16x16_3[0][1])); const 
__m128i V10 = _mm_load_si128((__m128i *) (transform16x16_3[1][0])); const __m128i V11 = _mm_load_si128((__m128i *) (transform16x16_3[1][1])); int j; m128iS0 = _mm_load_si128((__m128i *) (src)); m128iS1 = _mm_load_si128((__m128i *) (src + 16)); m128iS2 = _mm_load_si128((__m128i *) (src + 32)); m128iS3 = _mm_load_si128((__m128i *) (src + 48)); m128iS4 = _mm_loadu_si128((__m128i *) (src + 64)); m128iS5 = _mm_load_si128((__m128i *) (src + 80)); m128iS6 = _mm_load_si128((__m128i *) (src + 96)); m128iS7 = _mm_load_si128((__m128i *) (src + 112)); m128iS8 = _mm_load_si128((__m128i *) (src + 128)); m128iS9 = _mm_load_si128((__m128i *) (src + 144)); m128iS10 = _mm_load_si128((__m128i *) (src + 160)); m128iS11 = _mm_load_si128((__m128i *) (src + 176)); m128iS12 = _mm_load_si128((__m128i *) (src + 192)); m128iS13 = _mm_load_si128((__m128i *) (src + 208)); m128iS14 = _mm_load_si128((__m128i *) (src + 224)); m128iS15 = _mm_load_si128((__m128i *) (src + 240)); shift = shift_1st; m128iAdd = _mm_set1_epi32(add_1st); for (j = 0; j < 2; j++) { for (i = 0; i < 16; i += 8) { m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); E0l = _mm_madd_epi16(m128Tmp0,T00); m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); E0h = _mm_madd_epi16(m128Tmp1,T00); m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); E1l = _mm_madd_epi16(m128Tmp2,T10); m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); E1h = _mm_madd_epi16(m128Tmp3,T10); m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11); E2l = _mm_madd_epi16(m128Tmp4,T20); m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11); E2h = _mm_madd_epi16(m128Tmp5,T20); m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15); E3l = _mm_madd_epi16(m128Tmp6,T30); m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15); E3h = _mm_madd_epi16(m128Tmp7,T30); O0l = _mm_add_epi32(E0l, E1l); O0l = _mm_add_epi32(O0l, E2l); O0l = _mm_add_epi32(O0l, E3l); O0h = _mm_add_epi32(E0h, E1h); O0h = _mm_add_epi32(O0h, E2h); O0h = _mm_add_epi32(O0h, E3h); /* Compute O1*/ E0l = _mm_madd_epi16(m128Tmp0,T01); E0h 
= _mm_madd_epi16(m128Tmp1,T01); E1l = _mm_madd_epi16(m128Tmp2,T11); E1h = _mm_madd_epi16(m128Tmp3,T11); E2l = _mm_madd_epi16(m128Tmp4,T21); E2h = _mm_madd_epi16(m128Tmp5,T21); E3l = _mm_madd_epi16(m128Tmp6,T31); E3h = _mm_madd_epi16(m128Tmp7,T31); O1l = _mm_add_epi32(E0l, E1l); O1l = _mm_add_epi32(O1l, E2l); O1l = _mm_add_epi32(O1l, E3l); O1h = _mm_add_epi32(E0h, E1h); O1h = _mm_add_epi32(O1h, E2h); O1h = _mm_add_epi32(O1h, E3h); /* Compute O2*/ E0l = _mm_madd_epi16(m128Tmp0,T02); E0h = _mm_madd_epi16(m128Tmp1,T02); E1l = _mm_madd_epi16(m128Tmp2,T12); E1h = _mm_madd_epi16(m128Tmp3,T12); E2l = _mm_madd_epi16(m128Tmp4,T22); E2h = _mm_madd_epi16(m128Tmp5,T22); E3l = _mm_madd_epi16(m128Tmp6,T32); E3h = _mm_madd_epi16(m128Tmp7,T32); O2l = _mm_add_epi32(E0l, E1l); O2l = _mm_add_epi32(O2l, E2l); O2l = _mm_add_epi32(O2l, E3l); O2h = _mm_add_epi32(E0h, E1h); O2h = _mm_add_epi32(O2h, E2h); O2h = _mm_add_epi32(O2h, E3h); /* Compute O3*/ E0l = _mm_madd_epi16(m128Tmp0,T03); E0h = _mm_madd_epi16(m128Tmp1,T03); E1l = _mm_madd_epi16(m128Tmp2,T13); E1h = _mm_madd_epi16(m128Tmp3,T13); E2l = _mm_madd_epi16(m128Tmp4,T23); E2h = _mm_madd_epi16(m128Tmp5,T23); E3l = _mm_madd_epi16(m128Tmp6,T33); E3h = _mm_madd_epi16(m128Tmp7,T33); O3l = _mm_add_epi32(E0l, E1l); O3l = _mm_add_epi32(O3l, E2l); O3l = _mm_add_epi32(O3l, E3l); O3h = _mm_add_epi32(E0h, E1h); O3h = _mm_add_epi32(O3h, E2h); O3h = _mm_add_epi32(O3h, E3h); /* Compute O4*/ E0l = _mm_madd_epi16(m128Tmp0,T04); E0h = _mm_madd_epi16(m128Tmp1,T04); E1l = _mm_madd_epi16(m128Tmp2,T14); E1h = _mm_madd_epi16(m128Tmp3,T14); E2l = _mm_madd_epi16(m128Tmp4,T24); E2h = _mm_madd_epi16(m128Tmp5,T24); E3l = _mm_madd_epi16(m128Tmp6,T34); E3h = _mm_madd_epi16(m128Tmp7,T34); O4l = _mm_add_epi32(E0l, E1l); O4l = _mm_add_epi32(O4l, E2l); O4l = _mm_add_epi32(O4l, E3l); O4h = _mm_add_epi32(E0h, E1h); O4h = _mm_add_epi32(O4h, E2h); O4h = _mm_add_epi32(O4h, E3h); /* Compute O5*/ E0l = _mm_madd_epi16(m128Tmp0,T05); E0h = _mm_madd_epi16(m128Tmp1,T05); E1l = 
_mm_madd_epi16(m128Tmp2,T15); E1h = _mm_madd_epi16(m128Tmp3,T15); E2l = _mm_madd_epi16(m128Tmp4,T25); E2h = _mm_madd_epi16(m128Tmp5,T25); E3l = _mm_madd_epi16(m128Tmp6,T35); E3h = _mm_madd_epi16(m128Tmp7,T35); O5l = _mm_add_epi32(E0l, E1l); O5l = _mm_add_epi32(O5l, E2l); O5l = _mm_add_epi32(O5l, E3l); O5h = _mm_add_epi32(E0h, E1h); O5h = _mm_add_epi32(O5h, E2h); O5h = _mm_add_epi32(O5h, E3h); /* Compute O6*/ E0l = _mm_madd_epi16(m128Tmp0,T06); E0h = _mm_madd_epi16(m128Tmp1,T06); E1l = _mm_madd_epi16(m128Tmp2,T16); E1h = _mm_madd_epi16(m128Tmp3,T16); E2l = _mm_madd_epi16(m128Tmp4,T26); E2h = _mm_madd_epi16(m128Tmp5,T26); E3l = _mm_madd_epi16(m128Tmp6,T36); E3h = _mm_madd_epi16(m128Tmp7,T36); O6l = _mm_add_epi32(E0l, E1l); O6l = _mm_add_epi32(O6l, E2l); O6l = _mm_add_epi32(O6l, E3l); O6h = _mm_add_epi32(E0h, E1h); O6h = _mm_add_epi32(O6h, E2h); O6h = _mm_add_epi32(O6h, E3h); /* Compute O7*/ E0l = _mm_madd_epi16(m128Tmp0,T07); E0h = _mm_madd_epi16(m128Tmp1,T07); E1l = _mm_madd_epi16(m128Tmp2,T17); E1h = _mm_madd_epi16(m128Tmp3,T17); E2l = _mm_madd_epi16(m128Tmp4,T27); E2h = _mm_madd_epi16(m128Tmp5,T27); E3l = _mm_madd_epi16(m128Tmp6,T37); E3h = _mm_madd_epi16(m128Tmp7,T37); O7l = _mm_add_epi32(E0l, E1l); O7l = _mm_add_epi32(O7l, E2l); O7l = _mm_add_epi32(O7l, E3l); O7h = _mm_add_epi32(E0h, E1h); O7h = _mm_add_epi32(O7h, E2h); O7h = _mm_add_epi32(O7h, E3h); /* Compute E0 */ m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); E0l = _mm_madd_epi16(m128Tmp0,U00); m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); E0h = _mm_madd_epi16(m128Tmp1,U00); m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14); E0l = _mm_add_epi32(E0l, _mm_madd_epi16(m128Tmp2,U10)); m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14); E0h = _mm_add_epi32(E0h, _mm_madd_epi16(m128Tmp3,U10)); /* Compute E1 */ E1l = _mm_madd_epi16(m128Tmp0,U01); E1h = _mm_madd_epi16(m128Tmp1,U01); E1l = _mm_add_epi32(E1l, _mm_madd_epi16(m128Tmp2,U11)); E1h = _mm_add_epi32(E1h, _mm_madd_epi16(m128Tmp3,U11)); /* Compute E2 */ E2l 
= _mm_madd_epi16(m128Tmp0,U02); E2h = _mm_madd_epi16(m128Tmp1,U02); E2l = _mm_add_epi32(E2l, _mm_madd_epi16(m128Tmp2,U12)); E2h = _mm_add_epi32(E2h, _mm_madd_epi16(m128Tmp3,U12)); /* Compute E3 */ E3l = _mm_madd_epi16(m128Tmp0,U03); E3h = _mm_madd_epi16(m128Tmp1,U03); E3l = _mm_add_epi32(E3l, _mm_madd_epi16(m128Tmp2,U13)); E3h = _mm_add_epi32(E3h, _mm_madd_epi16(m128Tmp3,U13)); /* Compute EE0 and EEE */ m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12); E00l = _mm_madd_epi16(m128Tmp0,V00); m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12); E00h = _mm_madd_epi16(m128Tmp1,V00); m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8); EE0l = _mm_madd_epi16(m128Tmp2,V10); m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8); EE0h = _mm_madd_epi16(m128Tmp3,V10); E01l = _mm_madd_epi16(m128Tmp0,V01); E01h = _mm_madd_epi16(m128Tmp1,V01); EE1l = _mm_madd_epi16(m128Tmp2,V11); EE1h = _mm_madd_epi16(m128Tmp3,V11); /* Compute EE */ EE2l = _mm_sub_epi32(EE1l, E01l); EE3l = _mm_sub_epi32(EE0l, E00l); EE2h = _mm_sub_epi32(EE1h, E01h); EE3h = _mm_sub_epi32(EE0h, E00h); EE0l = _mm_add_epi32(EE0l, E00l); EE1l = _mm_add_epi32(EE1l, E01l); EE0h = _mm_add_epi32(EE0h, E00h); EE1h = _mm_add_epi32(EE1h, E01h); /* Compute E */ E4l = _mm_sub_epi32(EE3l, E3l); E4l = _mm_add_epi32(E4l, m128iAdd); E5l = _mm_sub_epi32(EE2l, E2l); E5l = _mm_add_epi32(E5l, m128iAdd); E6l = _mm_sub_epi32(EE1l, E1l); E6l = _mm_add_epi32(E6l, m128iAdd); E7l = _mm_sub_epi32(EE0l, E0l); E7l = _mm_add_epi32(E7l, m128iAdd); E4h = _mm_sub_epi32(EE3h, E3h); E4h = _mm_add_epi32(E4h, m128iAdd); E5h = _mm_sub_epi32(EE2h, E2h); E5h = _mm_add_epi32(E5h, m128iAdd); E6h = _mm_sub_epi32(EE1h, E1h); E6h = _mm_add_epi32(E6h, m128iAdd); E7h = _mm_sub_epi32(EE0h, E0h); E7h = _mm_add_epi32(E7h, m128iAdd); E0l = _mm_add_epi32(EE0l, E0l); E0l = _mm_add_epi32(E0l, m128iAdd); E1l = _mm_add_epi32(EE1l, E1l); E1l = _mm_add_epi32(E1l, m128iAdd); E2l = _mm_add_epi32(EE2l, E2l); E2l = _mm_add_epi32(E2l, m128iAdd); E3l = _mm_add_epi32(EE3l, E3l); E3l = 
_mm_add_epi32(E3l, m128iAdd); E0h = _mm_add_epi32(EE0h, E0h); E0h = _mm_add_epi32(E0h, m128iAdd); E1h = _mm_add_epi32(EE1h, E1h); E1h = _mm_add_epi32(E1h, m128iAdd); E2h = _mm_add_epi32(EE2h, E2h); E2h = _mm_add_epi32(E2h, m128iAdd); E3h = _mm_add_epi32(EE3h, E3h); E3h = _mm_add_epi32(E3h, m128iAdd); m128iS0 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift)); m128iS1 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift)); m128iS2 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift)); m128iS3 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift)); m128iS4 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift), _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift)); m128iS5 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift), _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift)); m128iS6 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift), _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift)); m128iS7 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift), _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift)); m128iS15 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift)); m128iS14 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift)); m128iS13 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift)); m128iS12 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift)); m128iS11 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift), _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift)); m128iS10 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift), 
_mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift)); m128iS9 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift), _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift)); m128iS8 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift), _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift)); if (!j) { //first pass /* Inverse the matrix */ E0l = _mm_unpacklo_epi16(m128iS0, m128iS8); E1l = _mm_unpacklo_epi16(m128iS1, m128iS9); E2l = _mm_unpacklo_epi16(m128iS2, m128iS10); E3l = _mm_unpacklo_epi16(m128iS3, m128iS11); E4l = _mm_unpacklo_epi16(m128iS4, m128iS12); E5l = _mm_unpacklo_epi16(m128iS5, m128iS13); E6l = _mm_unpacklo_epi16(m128iS6, m128iS14); E7l = _mm_unpacklo_epi16(m128iS7, m128iS15); E0h = _mm_unpackhi_epi16(m128iS0, m128iS8); E1h = _mm_unpackhi_epi16(m128iS1, m128iS9); E2h = _mm_unpackhi_epi16(m128iS2, m128iS10); E3h = _mm_unpackhi_epi16(m128iS3, m128iS11); E4h = _mm_unpackhi_epi16(m128iS4, m128iS12); E5h = _mm_unpackhi_epi16(m128iS5, m128iS13); E6h = _mm_unpackhi_epi16(m128iS6, m128iS14); E7h = _mm_unpackhi_epi16(m128iS7, m128iS15); m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l); m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l); m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l); m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l); m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l); m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l); m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); 
m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h); m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); if (!i) { r0= m128iS0; //0 r1= m128iS1; //16 r2= m128iS2; //32 r3= m128iS3; //48 r4= m128iS4; //64 r5= m128iS5; //80 r6= m128iS6; //96 r7= m128iS7; //112 r8= m128iS8; //128 r9= m128iS9; //144 r10= m128iS10; //160 r11= m128iS11; //176 r12= m128iS12; //192 r13= m128iS13; //208 r14= m128iS14; //224 r15= m128iS15; //240 m128iS0 = _mm_load_si128((__m128i *) (src + 8)); m128iS1 = _mm_load_si128((__m128i *) (src + 24)); m128iS2 = _mm_load_si128((__m128i *) (src + 40)); m128iS3 = _mm_load_si128((__m128i *) (src + 56)); m128iS4 = _mm_loadu_si128((__m128i *) (src + 72)); m128iS5 = _mm_load_si128((__m128i *) (src + 88)); m128iS6 = _mm_load_si128((__m128i *) 
(src + 104)); m128iS7 = _mm_load_si128((__m128i *) (src + 120)); m128iS8 = _mm_load_si128((__m128i *) (src + 136)); m128iS9 = _mm_load_si128((__m128i *) (src + 152)); m128iS10 = _mm_load_si128((__m128i *) (src + 168)); m128iS11 = _mm_load_si128((__m128i *) (src + 184)); m128iS12 = _mm_load_si128((__m128i *) (src + 200)); m128iS13 = _mm_load_si128((__m128i *) (src + 216)); m128iS14 = _mm_load_si128((__m128i *) (src + 232)); m128iS15 = _mm_load_si128((__m128i *) (src + 248)); } else { r16= m128iS0; //8 r17= m128iS1; //24 r18= m128iS2; //40 r19= m128iS3; //56 r20= m128iS4; //72 r21= m128iS5; //88 r22= m128iS6; //104 r23= m128iS7; //120 r24= m128iS8; //136 r25= m128iS9; //152 r26= m128iS10; //168 r27= m128iS11; //184 r28= m128iS12; //200 r29= m128iS13; //216 r30= m128iS14; //232 r31= m128iS15; //248 //prepare next iteration : m128iS0= r0; m128iS1= r2; m128iS2= r4; m128iS3= r6; m128iS4= r8; m128iS5= r10; m128iS6= r12; m128iS7= r14; m128iS8= r16; m128iS9= r18; m128iS10=r20; m128iS11=r22; m128iS12=r24; m128iS13=r26; m128iS14=r28; m128iS15=r30; shift = shift_2nd; m128iAdd = _mm_set1_epi32(add_2nd); } } else { //transpose half matrix : //instead of having 1 register = 1 half-column, //1 register = 1 half-row. 
E0l = _mm_unpacklo_epi16(m128iS0, m128iS1); E1l = _mm_unpacklo_epi16(m128iS2, m128iS3); E2l = _mm_unpacklo_epi16(m128iS4, m128iS5); E3l = _mm_unpacklo_epi16(m128iS6, m128iS7); E4l = _mm_unpacklo_epi16(m128iS8, m128iS9); E5l = _mm_unpacklo_epi16(m128iS10, m128iS11); E6l = _mm_unpacklo_epi16(m128iS12, m128iS13); E7l = _mm_unpacklo_epi16(m128iS14, m128iS15); O0l = _mm_unpackhi_epi16(m128iS0, m128iS1); O1l = _mm_unpackhi_epi16(m128iS2, m128iS3); O2l = _mm_unpackhi_epi16(m128iS4, m128iS5); O3l = _mm_unpackhi_epi16(m128iS6, m128iS7); O4l = _mm_unpackhi_epi16(m128iS8, m128iS9); O5l = _mm_unpackhi_epi16(m128iS10, m128iS11); O6l = _mm_unpackhi_epi16(m128iS12, m128iS13); O7l = _mm_unpackhi_epi16(m128iS14, m128iS15); m128Tmp0 = _mm_unpacklo_epi32(E0l, E1l); m128Tmp1 = _mm_unpacklo_epi32(E2l, E3l); m128Tmp2 = _mm_unpacklo_epi32(E4l, E5l); m128Tmp3 = _mm_unpacklo_epi32(E6l, E7l); r0 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1); //1st half 1st row r2 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3); //2nd half 1st row r4 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1); //1st half 2nd row r6 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3); //2nd half 2nd row m128Tmp0 = _mm_unpackhi_epi32(E0l, E1l); m128Tmp1 = _mm_unpackhi_epi32(E2l, E3l); m128Tmp2 = _mm_unpackhi_epi32(E4l, E5l); m128Tmp3 = _mm_unpackhi_epi32(E6l, E7l); r8 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1); r10 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3); r12 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1); r14 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3); m128Tmp0 = _mm_unpacklo_epi32(O0l, O1l); m128Tmp1 = _mm_unpacklo_epi32(O2l, O3l); m128Tmp2 = _mm_unpacklo_epi32(O4l, O5l); m128Tmp3 = _mm_unpacklo_epi32(O6l, O7l); r16 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1); r18 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3); r20 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1); r22 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3); m128Tmp0 = _mm_unpackhi_epi32(O0l, O1l); m128Tmp1 = _mm_unpackhi_epi32(O2l, O3l); m128Tmp2 = _mm_unpackhi_epi32(O4l, O5l); m128Tmp3 = _mm_unpackhi_epi32(O6l, O7l); r24 
= _mm_unpacklo_epi64(m128Tmp0, m128Tmp1); r26 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3); r28 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1); r30 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3); dst = (uint8_t*) (_dst + (i*stride)); m128Tmp0= _mm_setzero_si128(); m128Tmp1= _mm_load_si128((__m128i*)dst); m128Tmp2= _mm_load_si128((__m128i*)(dst+stride)); m128Tmp3= _mm_load_si128((__m128i*)(dst+2*stride)); m128Tmp4= _mm_load_si128((__m128i*)(dst+3*stride)); m128Tmp5= _mm_load_si128((__m128i*)(dst+4*stride)); m128Tmp6= _mm_load_si128((__m128i*)(dst+5*stride)); m128Tmp7= _mm_load_si128((__m128i*)(dst+6*stride)); E0l= _mm_load_si128((__m128i*)(dst+7*stride)); r0= _mm_adds_epi16(r0,_mm_unpacklo_epi8(m128Tmp1,m128Tmp0)); r2= _mm_adds_epi16(r2,_mm_unpackhi_epi8(m128Tmp1,m128Tmp0)); r0= _mm_packus_epi16(r0,r2); r4= _mm_adds_epi16(r4,_mm_unpacklo_epi8(m128Tmp2,m128Tmp0)); r6= _mm_adds_epi16(r6,_mm_unpackhi_epi8(m128Tmp2,m128Tmp0)); r4= _mm_packus_epi16(r4,r6); r8= _mm_adds_epi16(r8,_mm_unpacklo_epi8(m128Tmp3,m128Tmp0)); r10= _mm_adds_epi16(r10,_mm_unpackhi_epi8(m128Tmp3,m128Tmp0)); r8= _mm_packus_epi16(r8,r10); r12= _mm_adds_epi16(r12,_mm_unpacklo_epi8(m128Tmp4,m128Tmp0)); r14= _mm_adds_epi16(r14,_mm_unpackhi_epi8(m128Tmp4,m128Tmp0)); r12= _mm_packus_epi16(r12,r14); r16= _mm_adds_epi16(r16,_mm_unpacklo_epi8(m128Tmp5,m128Tmp0)); r18= _mm_adds_epi16(r18,_mm_unpackhi_epi8(m128Tmp5,m128Tmp0)); r16= _mm_packus_epi16(r16,r18); r20= _mm_adds_epi16(r20,_mm_unpacklo_epi8(m128Tmp6,m128Tmp0)); r22= _mm_adds_epi16(r22,_mm_unpackhi_epi8(m128Tmp6,m128Tmp0)); r20= _mm_packus_epi16(r20,r22); r24= _mm_adds_epi16(r24,_mm_unpacklo_epi8(m128Tmp7,m128Tmp0)); r26= _mm_adds_epi16(r26,_mm_unpackhi_epi8(m128Tmp7,m128Tmp0)); r24= _mm_packus_epi16(r24,r26); r28= _mm_adds_epi16(r28,_mm_unpacklo_epi8(E0l,m128Tmp0)); r30= _mm_adds_epi16(r30,_mm_unpackhi_epi8(E0l,m128Tmp0)); r28= _mm_packus_epi16(r28,r30); _mm_store_si128((__m128i*)dst,r0); _mm_store_si128((__m128i*)(dst+stride),r4); 
_mm_store_si128((__m128i*)(dst+2*stride),r8); _mm_store_si128((__m128i*)(dst+3*stride),r12); _mm_store_si128((__m128i*)(dst+4*stride),r16); _mm_store_si128((__m128i*)(dst+5*stride),r20); _mm_store_si128((__m128i*)(dst+6*stride),r24); _mm_store_si128((__m128i*)(dst+7*stride),r28); if (!i) { //first half done, can store ! m128iS0= r1; m128iS1= r3; m128iS2= r5; m128iS3= r7; m128iS4= r9; m128iS5= r11; m128iS6= r13; m128iS7= r15; m128iS8= r17; m128iS9= r19; m128iS10=r21; m128iS11=r23; m128iS12=r25; m128iS13=r27; m128iS14=r29; m128iS15=r31; } } } } } #endif #if 0 void ff_hevc_transform_16x16_add_10_sse4(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride) { int i; uint16_t *dst = (uint16_t*) _dst; ptrdiff_t stride = _stride / 2; int16_t *src = coeffs; int32_t shift; uint8_t shift_2nd = 10; //20 - bit depth uint16_t add_2nd = 1 << 9; //shift - 1; __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13, m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h, O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l, E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h; __m128i E4l, E5l, E6l, E7l; __m128i E4h, E5h, E6h, E7h; int j; m128iS0 = _mm_load_si128((__m128i *) (src)); m128iS1 = _mm_load_si128((__m128i *) (src + 16)); m128iS2 = _mm_load_si128((__m128i *) (src + 32)); m128iS3 = _mm_load_si128((__m128i *) (src + 48)); m128iS4 = _mm_loadu_si128((__m128i *) (src + 64)); m128iS5 = _mm_load_si128((__m128i *) (src + 80)); m128iS6 = _mm_load_si128((__m128i *) (src + 96)); m128iS7 = _mm_load_si128((__m128i *) (src + 112)); m128iS8 = _mm_load_si128((__m128i *) (src + 128)); m128iS9 = _mm_load_si128((__m128i *) (src + 144)); m128iS10 = _mm_load_si128((__m128i *) (src + 160)); m128iS11 = _mm_load_si128((__m128i *) (src + 176)); m128iS12 = _mm_loadu_si128((__m128i 
*) (src + 192)); m128iS13 = _mm_load_si128((__m128i *) (src + 208)); m128iS14 = _mm_load_si128((__m128i *) (src + 224)); m128iS15 = _mm_load_si128((__m128i *) (src + 240)); shift = shift_1st; m128iAdd = _mm_set1_epi32(add_1st); for (j = 0; j < 2; j++) { for (i = 0; i < 16; i += 8) { m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform16x16_1[1][0]))); m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform16x16_1[1][0]))); m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform16x16_1[2][0]))); m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform16x16_1[2][0]))); m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform16x16_1[3][0]))); m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform16x16_1[3][0]))); O0l = _mm_add_epi32(E0l, E1l); O0l = _mm_add_epi32(O0l, E2l); O0l = _mm_add_epi32(O0l, E3l); O0h = _mm_add_epi32(E0h, E1h); O0h = _mm_add_epi32(O0h, E2h); O0h = _mm_add_epi32(O0h, E3h); /* Compute O1*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform16x16_1[1][1]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform16x16_1[1][1]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) 
(transform16x16_1[2][1]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform16x16_1[2][1]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform16x16_1[3][1]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform16x16_1[3][1]))); O1l = _mm_add_epi32(E0l, E1l); O1l = _mm_add_epi32(O1l, E2l); O1l = _mm_add_epi32(O1l, E3l); O1h = _mm_add_epi32(E0h, E1h); O1h = _mm_add_epi32(O1h, E2h); O1h = _mm_add_epi32(O1h, E3h); /* Compute O2*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform16x16_1[1][2]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform16x16_1[1][2]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform16x16_1[2][2]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform16x16_1[2][2]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform16x16_1[3][2]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform16x16_1[3][2]))); O2l = _mm_add_epi32(E0l, E1l); O2l = _mm_add_epi32(O2l, E2l); O2l = _mm_add_epi32(O2l, E3l); O2h = _mm_add_epi32(E0h, E1h); O2h = _mm_add_epi32(O2h, E2h); O2h = _mm_add_epi32(O2h, E3h); /* Compute O3*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform16x16_1[1][3]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform16x16_1[1][3]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform16x16_1[2][3]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform16x16_1[2][3]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform16x16_1[3][3]))); E3h = 
_mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform16x16_1[3][3]))); O3l = _mm_add_epi32(E0l, E1l); O3l = _mm_add_epi32(O3l, E2l); O3l = _mm_add_epi32(O3l, E3l); O3h = _mm_add_epi32(E0h, E1h); O3h = _mm_add_epi32(O3h, E2h); O3h = _mm_add_epi32(O3h, E3h); /* Compute O4*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform16x16_1[1][4]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform16x16_1[1][4]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform16x16_1[2][4]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform16x16_1[2][4]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform16x16_1[3][4]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform16x16_1[3][4]))); O4l = _mm_add_epi32(E0l, E1l); O4l = _mm_add_epi32(O4l, E2l); O4l = _mm_add_epi32(O4l, E3l); O4h = _mm_add_epi32(E0h, E1h); O4h = _mm_add_epi32(O4h, E2h); O4h = _mm_add_epi32(O4h, E3h); /* Compute O5*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform16x16_1[1][5]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform16x16_1[1][5]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform16x16_1[2][5]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform16x16_1[2][5]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform16x16_1[3][5]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform16x16_1[3][5]))); O5l = _mm_add_epi32(E0l, E1l); O5l = _mm_add_epi32(O5l, E2l); O5l = _mm_add_epi32(O5l, E3l); O5h = _mm_add_epi32(E0h, E1h); O5h = 
_mm_add_epi32(O5h, E2h); O5h = _mm_add_epi32(O5h, E3h); /* Compute O6*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform16x16_1[1][6]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform16x16_1[1][6]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform16x16_1[2][6]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform16x16_1[2][6]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform16x16_1[3][6]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform16x16_1[3][6]))); O6l = _mm_add_epi32(E0l, E1l); O6l = _mm_add_epi32(O6l, E2l); O6l = _mm_add_epi32(O6l, E3l); O6h = _mm_add_epi32(E0h, E1h); O6h = _mm_add_epi32(O6h, E2h); O6h = _mm_add_epi32(O6h, E3h); /* Compute O7*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform16x16_1[1][7]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform16x16_1[1][7]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform16x16_1[2][7]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform16x16_1[2][7]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform16x16_1[3][7]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform16x16_1[3][7]))); O7l = _mm_add_epi32(E0l, E1l); O7l = _mm_add_epi32(O7l, E2l); O7l = _mm_add_epi32(O7l, E3l); O7h = _mm_add_epi32(E0h, E1h); O7h = _mm_add_epi32(O7h, E2h); O7h = _mm_add_epi32(O7h, E3h); /* Compute E0 */ m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); m128Tmp1 = 
_mm_unpackhi_epi16(m128iS2, m128iS6); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14); E0l = _mm_add_epi32(E0l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_2[1][0])))); m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14); E0h = _mm_add_epi32(E0h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_2[1][0])))); /* Compute E1 */ E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); E1l = _mm_add_epi32(E1l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_2[1][1])))); E1h = _mm_add_epi32(E1h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_2[1][1])))); /* Compute E2 */ E2l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); E2h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); E2l = _mm_add_epi32(E2l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_2[1][2])))); E2h = _mm_add_epi32(E2h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_2[1][2])))); /* Compute E3 */ E3l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); E3h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); E3l = _mm_add_epi32(E3l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_2[1][3])))); E3h = _mm_add_epi32(E3h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_2[1][3])))); /* Compute EE0 and EEE */ m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12); E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12); E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8); 
EE0l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8); EE0h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); EE1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); EE1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); /* Compute EE */ EE2l = _mm_sub_epi32(EE1l, E01l); EE3l = _mm_sub_epi32(EE0l, E00l); EE2h = _mm_sub_epi32(EE1h, E01h); EE3h = _mm_sub_epi32(EE0h, E00h); EE0l = _mm_add_epi32(EE0l, E00l); EE1l = _mm_add_epi32(EE1l, E01l); EE0h = _mm_add_epi32(EE0h, E00h); EE1h = _mm_add_epi32(EE1h, E01h); /* Compute E */ E4l = _mm_sub_epi32(EE3l, E3l); E4l = _mm_add_epi32(E4l, m128iAdd); E5l = _mm_sub_epi32(EE2l, E2l); E5l = _mm_add_epi32(E5l, m128iAdd); E6l = _mm_sub_epi32(EE1l, E1l); E6l = _mm_add_epi32(E6l, m128iAdd); E7l = _mm_sub_epi32(EE0l, E0l); E7l = _mm_add_epi32(E7l, m128iAdd); E4h = _mm_sub_epi32(EE3h, E3h); E4h = _mm_add_epi32(E4h, m128iAdd); E5h = _mm_sub_epi32(EE2h, E2h); E5h = _mm_add_epi32(E5h, m128iAdd); E6h = _mm_sub_epi32(EE1h, E1h); E6h = _mm_add_epi32(E6h, m128iAdd); E7h = _mm_sub_epi32(EE0h, E0h); E7h = _mm_add_epi32(E7h, m128iAdd); E0l = _mm_add_epi32(EE0l, E0l); E0l = _mm_add_epi32(E0l, m128iAdd); E1l = _mm_add_epi32(EE1l, E1l); E1l = _mm_add_epi32(E1l, m128iAdd); E2l = _mm_add_epi32(EE2l, E2l); E2l = _mm_add_epi32(E2l, m128iAdd); E3l = _mm_add_epi32(EE3l, E3l); E3l = _mm_add_epi32(E3l, m128iAdd); E0h = _mm_add_epi32(EE0h, E0h); E0h = _mm_add_epi32(E0h, m128iAdd); E1h = _mm_add_epi32(EE1h, E1h); E1h = _mm_add_epi32(E1h, m128iAdd); E2h = _mm_add_epi32(EE2h, E2h); E2h = _mm_add_epi32(E2h, m128iAdd); E3h = _mm_add_epi32(EE3h, E3h); E3h = _mm_add_epi32(E3h, m128iAdd); m128iS0 = _mm_packs_epi32( 
_mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift)); m128iS1 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift)); m128iS2 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift)); m128iS3 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift)); m128iS4 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift), _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift)); m128iS5 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift), _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift)); m128iS6 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift), _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift)); m128iS7 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift), _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift)); m128iS15 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift)); m128iS14 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift)); m128iS13 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift)); m128iS12 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift)); m128iS11 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift), _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift)); m128iS10 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift), _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift)); m128iS9 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift), _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift)); m128iS8 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift), _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift)); if (!j) { /* Inverse the matrix */ 
E0l = _mm_unpacklo_epi16(m128iS0, m128iS8); E1l = _mm_unpacklo_epi16(m128iS1, m128iS9); E2l = _mm_unpacklo_epi16(m128iS2, m128iS10); E3l = _mm_unpacklo_epi16(m128iS3, m128iS11); E4l = _mm_unpacklo_epi16(m128iS4, m128iS12); E5l = _mm_unpacklo_epi16(m128iS5, m128iS13); E6l = _mm_unpacklo_epi16(m128iS6, m128iS14); E7l = _mm_unpacklo_epi16(m128iS7, m128iS15); O0l = _mm_unpackhi_epi16(m128iS0, m128iS8); O1l = _mm_unpackhi_epi16(m128iS1, m128iS9); O2l = _mm_unpackhi_epi16(m128iS2, m128iS10); O3l = _mm_unpackhi_epi16(m128iS3, m128iS11); O4l = _mm_unpackhi_epi16(m128iS4, m128iS12); O5l = _mm_unpackhi_epi16(m128iS5, m128iS13); O6l = _mm_unpackhi_epi16(m128iS6, m128iS14); O7l = _mm_unpackhi_epi16(m128iS7, m128iS15); m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l); m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l); m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l); m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l); m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l); m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l); m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp0 = _mm_unpacklo_epi16(O0l, O4l); m128Tmp1 = _mm_unpacklo_epi16(O1l, O5l); m128Tmp2 = _mm_unpacklo_epi16(O2l, O6l); m128Tmp3 = 
_mm_unpacklo_epi16(O3l, O7l); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp0 = _mm_unpackhi_epi16(O0l, O4l); m128Tmp1 = _mm_unpackhi_epi16(O1l, O5l); m128Tmp2 = _mm_unpackhi_epi16(O2l, O6l); m128Tmp3 = _mm_unpackhi_epi16(O3l, O7l); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); /* */ _mm_store_si128((__m128i *) (src + i), m128iS0); _mm_store_si128((__m128i *) (src + 16 + i), m128iS1); _mm_store_si128((__m128i *) (src + 32 + i), m128iS2); _mm_store_si128((__m128i *) (src + 48 + i), m128iS3); _mm_store_si128((__m128i *) (src + 64 + i), m128iS4); _mm_store_si128((__m128i *) (src + 80 + i), m128iS5); _mm_store_si128((__m128i *) (src + 96 + i), m128iS6); _mm_store_si128((__m128i *) (src + 112 + i), m128iS7); _mm_store_si128((__m128i *) (src + 128 + i), m128iS8); _mm_store_si128((__m128i *) (src + 144 + i), m128iS9); _mm_store_si128((__m128i *) (src + 160 + i), m128iS10); _mm_store_si128((__m128i *) (src + 176 + i), m128iS11); _mm_store_si128((__m128i *) (src + 192 + i), m128iS12); _mm_store_si128((__m128i *) (src + 208 + i), m128iS13); _mm_store_si128((__m128i *) (src + 224 + i), m128iS14); _mm_store_si128((__m128i *) (src + 240 + i), m128iS15); if (!i) { m128iS0 = _mm_load_si128((__m128i *) (src + 8)); m128iS1 = _mm_load_si128((__m128i *) (src + 24)); 
m128iS2 = _mm_load_si128((__m128i *) (src + 40)); m128iS3 = _mm_load_si128((__m128i *) (src + 56)); m128iS4 = _mm_loadu_si128((__m128i *) (src + 72)); m128iS5 = _mm_load_si128((__m128i *) (src + 88)); m128iS6 = _mm_load_si128((__m128i *) (src + 104)); m128iS7 = _mm_load_si128((__m128i *) (src + 120)); m128iS8 = _mm_load_si128((__m128i *) (src + 136)); m128iS9 = _mm_load_si128((__m128i *) (src + 152)); m128iS10 = _mm_load_si128((__m128i *) (src + 168)); m128iS11 = _mm_load_si128((__m128i *) (src + 184)); m128iS12 = _mm_loadu_si128((__m128i *) (src + 200)); m128iS13 = _mm_load_si128((__m128i *) (src + 216)); m128iS14 = _mm_load_si128((__m128i *) (src + 232)); m128iS15 = _mm_load_si128((__m128i *) (src + 248)); } else { m128iS0 = _mm_load_si128((__m128i *) (src)); m128iS1 = _mm_load_si128((__m128i *) (src + 32)); m128iS2 = _mm_load_si128((__m128i *) (src + 64)); m128iS3 = _mm_load_si128((__m128i *) (src + 96)); m128iS4 = _mm_loadu_si128((__m128i *) (src + 128)); m128iS5 = _mm_load_si128((__m128i *) (src + 160)); m128iS6 = _mm_load_si128((__m128i *) (src + 192)); m128iS7 = _mm_load_si128((__m128i *) (src + 224)); m128iS8 = _mm_load_si128((__m128i *) (src + 8)); m128iS9 = _mm_load_si128((__m128i *) (src + 32 + 8)); m128iS10 = _mm_load_si128((__m128i *) (src + 64 + 8)); m128iS11 = _mm_load_si128((__m128i *) (src + 96 + 8)); m128iS12 = _mm_loadu_si128((__m128i *) (src + 128 + 8)); m128iS13 = _mm_load_si128((__m128i *) (src + 160 + 8)); m128iS14 = _mm_load_si128((__m128i *) (src + 192 + 8)); m128iS15 = _mm_load_si128((__m128i *) (src + 224 + 8)); shift = shift_2nd; m128iAdd = _mm_set1_epi32(add_2nd); } } else { int k, m = 0; _mm_storeu_si128((__m128i *) (src), m128iS0); _mm_storeu_si128((__m128i *) (src + 8), m128iS1); _mm_storeu_si128((__m128i *) (src + 32), m128iS2); _mm_storeu_si128((__m128i *) (src + 40), m128iS3); _mm_storeu_si128((__m128i *) (src + 64), m128iS4); _mm_storeu_si128((__m128i *) (src + 72), m128iS5); _mm_storeu_si128((__m128i *) (src + 96), m128iS6); 
_mm_storeu_si128((__m128i *) (src + 104), m128iS7); _mm_storeu_si128((__m128i *) (src + 128), m128iS8); _mm_storeu_si128((__m128i *) (src + 136), m128iS9); _mm_storeu_si128((__m128i *) (src + 160), m128iS10); _mm_storeu_si128((__m128i *) (src + 168), m128iS11); _mm_storeu_si128((__m128i *) (src + 192), m128iS12); _mm_storeu_si128((__m128i *) (src + 200), m128iS13); _mm_storeu_si128((__m128i *) (src + 224), m128iS14); _mm_storeu_si128((__m128i *) (src + 232), m128iS15); dst = (uint16_t*) _dst + (i * stride); for (k = 0; k < 8; k++) { dst[0] = av_clip_uintp2(dst[0] + src[m],10); dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10); dst[2] = av_clip_uintp2(dst[2] + src[m + 32],10); dst[3] = av_clip_uintp2(dst[3] + src[m + 40],10); dst[4] = av_clip_uintp2(dst[4] + src[m + 64],10); dst[5] = av_clip_uintp2(dst[5] + src[m + 72],10); dst[6] = av_clip_uintp2(dst[6] + src[m + 96],10); dst[7] = av_clip_uintp2(dst[7] + src[m + 104],10); dst[8] = av_clip_uintp2(dst[8] + src[m + 128],10); dst[9] = av_clip_uintp2(dst[9] + src[m + 136],10); dst[10] = av_clip_uintp2(dst[10] + src[m + 160],10); dst[11] = av_clip_uintp2(dst[11] + src[m + 168],10); dst[12] = av_clip_uintp2(dst[12] + src[m + 192],10); dst[13] = av_clip_uintp2(dst[13] + src[m + 200],10); dst[14] = av_clip_uintp2(dst[14] + src[m + 224],10); dst[15] = av_clip_uintp2(dst[15] + src[m + 232],10); m += 1; dst += stride; } if (!i) { m128iS0 = _mm_load_si128((__m128i *) (src + 16)); m128iS1 = _mm_load_si128((__m128i *) (src + 48)); m128iS2 = _mm_load_si128((__m128i *) (src + 80)); m128iS3 = _mm_loadu_si128((__m128i *) (src + 112)); m128iS4 = _mm_load_si128((__m128i *) (src + 144)); m128iS5 = _mm_load_si128((__m128i *) (src + 176)); m128iS6 = _mm_load_si128((__m128i *) (src + 208)); m128iS7 = _mm_load_si128((__m128i *) (src + 240)); m128iS8 = _mm_load_si128((__m128i *) (src + 24)); m128iS9 = _mm_load_si128((__m128i *) (src + 56)); m128iS10 = _mm_load_si128((__m128i *) (src + 88)); m128iS11 = _mm_loadu_si128((__m128i *) (src + 120)); 
m128iS12 = _mm_load_si128((__m128i *) (src + 152)); m128iS13 = _mm_load_si128((__m128i *) (src + 184)); m128iS14 = _mm_load_si128((__m128i *) (src + 216)); m128iS15 = _mm_load_si128((__m128i *) (src + 248)); } } } } } #endif #if HAVE_SSE4_1 void ff_hevc_transform_32x32_add_8_sse4(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride) { uint8_t shift_2nd = 12; // 20 - Bit depth uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1)) int i, j; uint8_t *dst = (uint8_t*) _dst; ptrdiff_t stride = _stride / sizeof(uint8_t); int shift; const int16_t *src = coeffs; __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13, m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h, O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l, E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h; __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l; __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h, EEE0l, EEE1l, EEE0h, EEE1h; __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21, m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27, m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9, m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15, O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l, O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l, EE4l, EE7h, EE6h, EE5h, EE4h; __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15,r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31; __m128i r32,r33,r34,r35,r36,r37,r38,r39,r40,r41,r42,r43,r44,r45,r46,r47,r48,r49,r50,r51,r52,r53,r54,r55,r56,r57,r58,r59,r60,r61,r62,r63; __m128i r64,r65,r66,r67,r68,r69,r70,r71,r72,r73,r74,r75,r76,r77,r78,r79,r80,r81,r82,r83,r84,r85,r86,r87,r88,r89,r90,r91,r92,r93,r94,r95; 
__m128i r96,r97,r98,r99,r100,r101,r102,r103,r104,r105,r106,r107,r108,r109,r110,r111,r112,r113,r114,r115,r116,r117,r118,r119,r120,r121,r122,r123,r124,r125,r126,r127; m128iS0 = _mm_load_si128((__m128i *) (src)); m128iS1 = _mm_load_si128((__m128i *) (src + 32)); m128iS2 = _mm_load_si128((__m128i *) (src + 64)); m128iS3 = _mm_load_si128((__m128i *) (src + 96)); m128iS4 = _mm_loadu_si128((__m128i *) (src + 128)); m128iS5 = _mm_load_si128((__m128i *) (src + 160)); m128iS6 = _mm_load_si128((__m128i *) (src + 192)); m128iS7 = _mm_load_si128((__m128i *) (src + 224)); m128iS8 = _mm_load_si128((__m128i *) (src + 256)); m128iS9 = _mm_load_si128((__m128i *) (src + 288)); m128iS10 = _mm_load_si128((__m128i *) (src + 320)); m128iS11 = _mm_load_si128((__m128i *) (src + 352)); m128iS12 = _mm_load_si128((__m128i *) (src + 384)); m128iS13 = _mm_load_si128((__m128i *) (src + 416)); m128iS14 = _mm_load_si128((__m128i *) (src + 448)); m128iS15 = _mm_load_si128((__m128i *) (src + 480)); m128iS16 = _mm_load_si128((__m128i *) (src + 512)); m128iS17 = _mm_load_si128((__m128i *) (src + 544)); m128iS18 = _mm_load_si128((__m128i *) (src + 576)); m128iS19 = _mm_load_si128((__m128i *) (src + 608)); m128iS20 = _mm_load_si128((__m128i *) (src + 640)); m128iS21 = _mm_load_si128((__m128i *) (src + 672)); m128iS22 = _mm_load_si128((__m128i *) (src + 704)); m128iS23 = _mm_load_si128((__m128i *) (src + 736)); m128iS24 = _mm_load_si128((__m128i *) (src + 768)); m128iS25 = _mm_load_si128((__m128i *) (src + 800)); m128iS26 = _mm_load_si128((__m128i *) (src + 832)); m128iS27 = _mm_load_si128((__m128i *) (src + 864)); m128iS28 = _mm_load_si128((__m128i *) (src + 896)); m128iS29 = _mm_load_si128((__m128i *) (src + 928)); m128iS30 = _mm_load_si128((__m128i *) (src + 960)); m128iS31 = _mm_load_si128((__m128i *) (src + 992)); shift = shift_1st; m128iAdd = _mm_set1_epi32(add_1st); for (j = 0; j < 2; j++) { for (i = 0; i < 32; i += 8) { m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); E0l = 
_mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][0]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][0]))); m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][0]))); m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][0]))); m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][0]))); m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][0]))); m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][0]))); m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][0]))); m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][0]))); m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][0]))); m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][0]))); m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][0]))); m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][0]))); m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][0]))); m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][0]))); m128Tmp15 = 
_mm_unpackhi_epi16(m128iS29, m128iS31); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][0]))); O0l = _mm_add_epi32(E0l, E1l); O0l = _mm_add_epi32(O0l, E2l); O0l = _mm_add_epi32(O0l, E3l); O0l = _mm_add_epi32(O0l, E4l); O0l = _mm_add_epi32(O0l, E5l); O0l = _mm_add_epi32(O0l, E6l); O0l = _mm_add_epi32(O0l, E7l); O0h = _mm_add_epi32(E0h, E1h); O0h = _mm_add_epi32(O0h, E2h); O0h = _mm_add_epi32(O0h, E3h); O0h = _mm_add_epi32(O0h, E4h); O0h = _mm_add_epi32(O0h, E5h); O0h = _mm_add_epi32(O0h, E6h); O0h = _mm_add_epi32(O0h, E7h); /* Compute O1*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][1]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][1]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][1]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][1]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][1]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][1]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][1]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][1]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][1]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][1]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][1]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][1]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][1]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][1]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][1]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][1]))); O1l = _mm_add_epi32(E0l, E1l); O1l = _mm_add_epi32(O1l, E2l); O1l = 
_mm_add_epi32(O1l, E3l); O1l = _mm_add_epi32(O1l, E4l); O1l = _mm_add_epi32(O1l, E5l); O1l = _mm_add_epi32(O1l, E6l); O1l = _mm_add_epi32(O1l, E7l); O1h = _mm_add_epi32(E0h, E1h); O1h = _mm_add_epi32(O1h, E2h); O1h = _mm_add_epi32(O1h, E3h); O1h = _mm_add_epi32(O1h, E4h); O1h = _mm_add_epi32(O1h, E5h); O1h = _mm_add_epi32(O1h, E6h); O1h = _mm_add_epi32(O1h, E7h); /* Compute O2*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][2]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][2]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][2]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][2]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][2]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][2]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][2]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][2]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][2]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][2]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][2]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][2]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][2]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][2]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][2]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][2]))); O2l = _mm_add_epi32(E0l, E1l); O2l = _mm_add_epi32(O2l, E2l); O2l = _mm_add_epi32(O2l, E3l); O2l = _mm_add_epi32(O2l, E4l); O2l = _mm_add_epi32(O2l, E5l); O2l = _mm_add_epi32(O2l, E6l); O2l = _mm_add_epi32(O2l, E7l); O2h = _mm_add_epi32(E0h, E1h); O2h = 
_mm_add_epi32(O2h, E2h); O2h = _mm_add_epi32(O2h, E3h); O2h = _mm_add_epi32(O2h, E4h); O2h = _mm_add_epi32(O2h, E5h); O2h = _mm_add_epi32(O2h, E6h); O2h = _mm_add_epi32(O2h, E7h); /* Compute O3*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][3]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][3]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][3]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][3]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][3]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][3]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][3]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][3]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][3]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][3]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][3]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][3]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][3]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][3]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][3]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][3]))); O3l = _mm_add_epi32(E0l, E1l); O3l = _mm_add_epi32(O3l, E2l); O3l = _mm_add_epi32(O3l, E3l); O3l = _mm_add_epi32(O3l, E4l); O3l = _mm_add_epi32(O3l, E5l); O3l = _mm_add_epi32(O3l, E6l); O3l = _mm_add_epi32(O3l, E7l); O3h = _mm_add_epi32(E0h, E1h); O3h = _mm_add_epi32(O3h, E2h); O3h = _mm_add_epi32(O3h, E3h); O3h = _mm_add_epi32(O3h, E4h); O3h = _mm_add_epi32(O3h, E5h); O3h = _mm_add_epi32(O3h, E6h); O3h = _mm_add_epi32(O3h, E7h); /* Compute O4*/ E0l 
= _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][4]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][4]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][4]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][4]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][4]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][4]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][4]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][4]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][4]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][4]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][4]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][4]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][4]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][4]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][4]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][4]))); O4l = _mm_add_epi32(E0l, E1l); O4l = _mm_add_epi32(O4l, E2l); O4l = _mm_add_epi32(O4l, E3l); O4l = _mm_add_epi32(O4l, E4l); O4l = _mm_add_epi32(O4l, E5l); O4l = _mm_add_epi32(O4l, E6l); O4l = _mm_add_epi32(O4l, E7l); O4h = _mm_add_epi32(E0h, E1h); O4h = _mm_add_epi32(O4h, E2h); O4h = _mm_add_epi32(O4h, E3h); O4h = _mm_add_epi32(O4h, E4h); O4h = _mm_add_epi32(O4h, E5h); O4h = _mm_add_epi32(O4h, E6h); O4h = _mm_add_epi32(O4h, E7h); /* Compute O5*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][5]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][5]))); E1l = _mm_madd_epi16(m128Tmp2, 
_mm_load_si128((__m128i *) (transform32x32[1][5]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][5]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][5]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][5]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][5]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][5]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][5]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][5]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][5]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][5]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][5]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][5]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][5]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][5]))); O5l = _mm_add_epi32(E0l, E1l); O5l = _mm_add_epi32(O5l, E2l); O5l = _mm_add_epi32(O5l, E3l); O5l = _mm_add_epi32(O5l, E4l); O5l = _mm_add_epi32(O5l, E5l); O5l = _mm_add_epi32(O5l, E6l); O5l = _mm_add_epi32(O5l, E7l); O5h = _mm_add_epi32(E0h, E1h); O5h = _mm_add_epi32(O5h, E2h); O5h = _mm_add_epi32(O5h, E3h); O5h = _mm_add_epi32(O5h, E4h); O5h = _mm_add_epi32(O5h, E5h); O5h = _mm_add_epi32(O5h, E6h); O5h = _mm_add_epi32(O5h, E7h); /* Compute O6*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][6]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][6]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][6]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][6]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) 
(transform32x32[2][6]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][6]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][6]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][6]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][6]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][6]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][6]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][6]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][6]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][6]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][6]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][6]))); O6l = _mm_add_epi32(E0l, E1l); O6l = _mm_add_epi32(O6l, E2l); O6l = _mm_add_epi32(O6l, E3l); O6l = _mm_add_epi32(O6l, E4l); O6l = _mm_add_epi32(O6l, E5l); O6l = _mm_add_epi32(O6l, E6l); O6l = _mm_add_epi32(O6l, E7l); O6h = _mm_add_epi32(E0h, E1h); O6h = _mm_add_epi32(O6h, E2h); O6h = _mm_add_epi32(O6h, E3h); O6h = _mm_add_epi32(O6h, E4h); O6h = _mm_add_epi32(O6h, E5h); O6h = _mm_add_epi32(O6h, E6h); O6h = _mm_add_epi32(O6h, E7h); /* Compute O7*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][7]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][7]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][7]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][7]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][7]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][7]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][7]))); E3h = 
_mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][7]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][7]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][7]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][7]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][7]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][7]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][7]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][7]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][7]))); O7l = _mm_add_epi32(E0l, E1l); O7l = _mm_add_epi32(O7l, E2l); O7l = _mm_add_epi32(O7l, E3l); O7l = _mm_add_epi32(O7l, E4l); O7l = _mm_add_epi32(O7l, E5l); O7l = _mm_add_epi32(O7l, E6l); O7l = _mm_add_epi32(O7l, E7l); O7h = _mm_add_epi32(E0h, E1h); O7h = _mm_add_epi32(O7h, E2h); O7h = _mm_add_epi32(O7h, E3h); O7h = _mm_add_epi32(O7h, E4h); O7h = _mm_add_epi32(O7h, E5h); O7h = _mm_add_epi32(O7h, E6h); O7h = _mm_add_epi32(O7h, E7h); /* Compute O8*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][8]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][8]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][8]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][8]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][8]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][8]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][8]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][8]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][8]))); E4h = _mm_madd_epi16(m128Tmp9, 
_mm_load_si128((__m128i *) (transform32x32[4][8]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][8]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][8]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][8]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][8]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][8]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][8]))); O8l = _mm_add_epi32(E0l, E1l); O8l = _mm_add_epi32(O8l, E2l); O8l = _mm_add_epi32(O8l, E3l); O8l = _mm_add_epi32(O8l, E4l); O8l = _mm_add_epi32(O8l, E5l); O8l = _mm_add_epi32(O8l, E6l); O8l = _mm_add_epi32(O8l, E7l); O8h = _mm_add_epi32(E0h, E1h); O8h = _mm_add_epi32(O8h, E2h); O8h = _mm_add_epi32(O8h, E3h); O8h = _mm_add_epi32(O8h, E4h); O8h = _mm_add_epi32(O8h, E5h); O8h = _mm_add_epi32(O8h, E6h); O8h = _mm_add_epi32(O8h, E7h); /* Compute O9*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][9]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][9]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][9]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][9]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][9]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][9]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][9]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][9]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][9]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][9]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][9]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) 
(transform32x32[5][9]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][9]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][9]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][9]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][9]))); O9l = _mm_add_epi32(E0l, E1l); O9l = _mm_add_epi32(O9l, E2l); O9l = _mm_add_epi32(O9l, E3l); O9l = _mm_add_epi32(O9l, E4l); O9l = _mm_add_epi32(O9l, E5l); O9l = _mm_add_epi32(O9l, E6l); O9l = _mm_add_epi32(O9l, E7l); O9h = _mm_add_epi32(E0h, E1h); O9h = _mm_add_epi32(O9h, E2h); O9h = _mm_add_epi32(O9h, E3h); O9h = _mm_add_epi32(O9h, E4h); O9h = _mm_add_epi32(O9h, E5h); O9h = _mm_add_epi32(O9h, E6h); O9h = _mm_add_epi32(O9h, E7h); /* Compute 10*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][10]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][10]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][10]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][10]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][10]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][10]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][10]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][10]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][10]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][10]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][10]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][10]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][10]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) 
(transform32x32[6][10]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][10]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][10]))); O10l = _mm_add_epi32(E0l, E1l); O10l = _mm_add_epi32(O10l, E2l); O10l = _mm_add_epi32(O10l, E3l); O10l = _mm_add_epi32(O10l, E4l); O10l = _mm_add_epi32(O10l, E5l); O10l = _mm_add_epi32(O10l, E6l); O10l = _mm_add_epi32(O10l, E7l); O10h = _mm_add_epi32(E0h, E1h); O10h = _mm_add_epi32(O10h, E2h); O10h = _mm_add_epi32(O10h, E3h); O10h = _mm_add_epi32(O10h, E4h); O10h = _mm_add_epi32(O10h, E5h); O10h = _mm_add_epi32(O10h, E6h); O10h = _mm_add_epi32(O10h, E7h); /* Compute 11*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][11]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][11]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][11]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][11]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][11]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][11]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][11]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][11]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][11]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][11]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][11]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][11]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][11]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][11]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][11]))); E7h = _mm_madd_epi16(m128Tmp15, 
_mm_load_si128((__m128i *) (transform32x32[7][11]))); O11l = _mm_add_epi32(E0l, E1l); O11l = _mm_add_epi32(O11l, E2l); O11l = _mm_add_epi32(O11l, E3l); O11l = _mm_add_epi32(O11l, E4l); O11l = _mm_add_epi32(O11l, E5l); O11l = _mm_add_epi32(O11l, E6l); O11l = _mm_add_epi32(O11l, E7l); O11h = _mm_add_epi32(E0h, E1h); O11h = _mm_add_epi32(O11h, E2h); O11h = _mm_add_epi32(O11h, E3h); O11h = _mm_add_epi32(O11h, E4h); O11h = _mm_add_epi32(O11h, E5h); O11h = _mm_add_epi32(O11h, E6h); O11h = _mm_add_epi32(O11h, E7h); /* Compute 12*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][12]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][12]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][12]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][12]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][12]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][12]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][12]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][12]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][12]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][12]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][12]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][12]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][12]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][12]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][12]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][12]))); O12l = _mm_add_epi32(E0l, E1l); O12l = _mm_add_epi32(O12l, E2l); O12l = _mm_add_epi32(O12l, E3l); O12l 
= _mm_add_epi32(O12l, E4l); O12l = _mm_add_epi32(O12l, E5l); O12l = _mm_add_epi32(O12l, E6l); O12l = _mm_add_epi32(O12l, E7l); O12h = _mm_add_epi32(E0h, E1h); O12h = _mm_add_epi32(O12h, E2h); O12h = _mm_add_epi32(O12h, E3h); O12h = _mm_add_epi32(O12h, E4h); O12h = _mm_add_epi32(O12h, E5h); O12h = _mm_add_epi32(O12h, E6h); O12h = _mm_add_epi32(O12h, E7h); /* Compute 13*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][13]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][13]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][13]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][13]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][13]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][13]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][13]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][13]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][13]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][13]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][13]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][13]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][13]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][13]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][13]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][13]))); O13l = _mm_add_epi32(E0l, E1l); O13l = _mm_add_epi32(O13l, E2l); O13l = _mm_add_epi32(O13l, E3l); O13l = _mm_add_epi32(O13l, E4l); O13l = _mm_add_epi32(O13l, E5l); O13l = _mm_add_epi32(O13l, E6l); O13l = _mm_add_epi32(O13l, E7l); O13h = _mm_add_epi32(E0h, 
E1h); O13h = _mm_add_epi32(O13h, E2h); O13h = _mm_add_epi32(O13h, E3h); O13h = _mm_add_epi32(O13h, E4h); O13h = _mm_add_epi32(O13h, E5h); O13h = _mm_add_epi32(O13h, E6h); O13h = _mm_add_epi32(O13h, E7h); /* Compute O14 */ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][14]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][14]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][14]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][14]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][14]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][14]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][14]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][14]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][14]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][14]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][14]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][14]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][14]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][14]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][14]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][14]))); O14l = _mm_add_epi32(E0l, E1l); O14l = _mm_add_epi32(O14l, E2l); O14l = _mm_add_epi32(O14l, E3l); O14l = _mm_add_epi32(O14l, E4l); O14l = _mm_add_epi32(O14l, E5l); O14l = _mm_add_epi32(O14l, E6l); O14l = _mm_add_epi32(O14l, E7l); O14h = _mm_add_epi32(E0h, E1h); O14h = _mm_add_epi32(O14h, E2h); O14h = _mm_add_epi32(O14h, E3h); O14h = _mm_add_epi32(O14h, E4h); O14h = _mm_add_epi32(O14h, E5h); O14h = 
_mm_add_epi32(O14h, E6h); O14h = _mm_add_epi32(O14h, E7h); /* Compute O15*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][15]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][15]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][15]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][15]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][15]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][15]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][15]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][15]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][15]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][15]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][15]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][15]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][15]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][15]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][15]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][15]))); O15l = _mm_add_epi32(E0l, E1l); O15l = _mm_add_epi32(O15l, E2l); O15l = _mm_add_epi32(O15l, E3l); O15l = _mm_add_epi32(O15l, E4l); O15l = _mm_add_epi32(O15l, E5l); O15l = _mm_add_epi32(O15l, E6l); O15l = _mm_add_epi32(O15l, E7l); O15h = _mm_add_epi32(E0h, E1h); O15h = _mm_add_epi32(O15h, E2h); O15h = _mm_add_epi32(O15h, E3h); O15h = _mm_add_epi32(O15h, E4h); O15h = _mm_add_epi32(O15h, E5h); O15h = _mm_add_epi32(O15h, E6h); O15h = _mm_add_epi32(O15h, E7h); /* Compute E0 */ m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); E0l = _mm_madd_epi16(m128Tmp0, 
_mm_load_si128((__m128i *) (transform16x16_1[0][0]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14); E0l = _mm_add_epi32(E0l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_1[1][0])))); m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14); E0h = _mm_add_epi32(E0h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_1[1][0])))); m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22); E0l = _mm_add_epi32(E0l, _mm_madd_epi16(m128Tmp4, _mm_load_si128( (__m128i *) (transform16x16_1[2][0])))); m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22); E0h = _mm_add_epi32(E0h, _mm_madd_epi16(m128Tmp5, _mm_load_si128( (__m128i *) (transform16x16_1[2][0])))); m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30); E0l = _mm_add_epi32(E0l, _mm_madd_epi16(m128Tmp6, _mm_load_si128( (__m128i *) (transform16x16_1[3][0])))); m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30); E0h = _mm_add_epi32(E0h, _mm_madd_epi16(m128Tmp7, _mm_load_si128( (__m128i *) (transform16x16_1[3][0])))); /* Compute E1 */ E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); E1l = _mm_add_epi32(E1l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_1[1][1])))); E1h = _mm_add_epi32(E1h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_1[1][1])))); E1l = _mm_add_epi32(E1l, _mm_madd_epi16(m128Tmp4, _mm_load_si128( (__m128i *) (transform16x16_1[2][1])))); E1h = _mm_add_epi32(E1h, _mm_madd_epi16(m128Tmp5, _mm_load_si128( (__m128i *) (transform16x16_1[2][1])))); E1l = _mm_add_epi32(E1l, _mm_madd_epi16(m128Tmp6, _mm_load_si128( (__m128i *) (transform16x16_1[3][1])))); E1h = _mm_add_epi32(E1h, _mm_madd_epi16(m128Tmp7, _mm_load_si128( (__m128i *) (transform16x16_1[3][1])))); /* Compute E2 */ E2l = 
_mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); E2h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); E2l = _mm_add_epi32(E2l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_1[1][2])))); E2h = _mm_add_epi32(E2h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_1[1][2])))); E2l = _mm_add_epi32(E2l, _mm_madd_epi16(m128Tmp4, _mm_load_si128( (__m128i *) (transform16x16_1[2][2])))); E2h = _mm_add_epi32(E2h, _mm_madd_epi16(m128Tmp5, _mm_load_si128( (__m128i *) (transform16x16_1[2][2])))); E2l = _mm_add_epi32(E2l, _mm_madd_epi16(m128Tmp6, _mm_load_si128( (__m128i *) (transform16x16_1[3][2])))); E2h = _mm_add_epi32(E2h, _mm_madd_epi16(m128Tmp7, _mm_load_si128( (__m128i *) (transform16x16_1[3][2])))); /* Compute E3 */ E3l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); E3h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); E3l = _mm_add_epi32(E3l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_1[1][3])))); E3h = _mm_add_epi32(E3h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_1[1][3])))); E3l = _mm_add_epi32(E3l, _mm_madd_epi16(m128Tmp4, _mm_load_si128( (__m128i *) (transform16x16_1[2][3])))); E3h = _mm_add_epi32(E3h, _mm_madd_epi16(m128Tmp5, _mm_load_si128( (__m128i *) (transform16x16_1[2][3])))); E3l = _mm_add_epi32(E3l, _mm_madd_epi16(m128Tmp6, _mm_load_si128( (__m128i *) (transform16x16_1[3][3])))); E3h = _mm_add_epi32(E3h, _mm_madd_epi16(m128Tmp7, _mm_load_si128( (__m128i *) (transform16x16_1[3][3])))); /* Compute E4 */ E4l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); E4h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); E4l = _mm_add_epi32(E4l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_1[1][4])))); E4h = _mm_add_epi32(E4h, _mm_madd_epi16(m128Tmp3, 
_mm_load_si128( (__m128i *) (transform16x16_1[1][4])))); E4l = _mm_add_epi32(E4l, _mm_madd_epi16(m128Tmp4, _mm_load_si128( (__m128i *) (transform16x16_1[2][4])))); E4h = _mm_add_epi32(E4h, _mm_madd_epi16(m128Tmp5, _mm_load_si128( (__m128i *) (transform16x16_1[2][4])))); E4l = _mm_add_epi32(E4l, _mm_madd_epi16(m128Tmp6, _mm_load_si128( (__m128i *) (transform16x16_1[3][4])))); E4h = _mm_add_epi32(E4h, _mm_madd_epi16(m128Tmp7, _mm_load_si128( (__m128i *) (transform16x16_1[3][4])))); /* Compute E5 */ E5l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); E5h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); E5l = _mm_add_epi32(E5l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_1[1][5])))); E5h = _mm_add_epi32(E5h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_1[1][5])))); E5l = _mm_add_epi32(E5l, _mm_madd_epi16(m128Tmp4, _mm_load_si128( (__m128i *) (transform16x16_1[2][5])))); E5h = _mm_add_epi32(E5h, _mm_madd_epi16(m128Tmp5, _mm_load_si128( (__m128i *) (transform16x16_1[2][5])))); E5l = _mm_add_epi32(E5l, _mm_madd_epi16(m128Tmp6, _mm_load_si128( (__m128i *) (transform16x16_1[3][5])))); E5h = _mm_add_epi32(E5h, _mm_madd_epi16(m128Tmp7, _mm_load_si128( (__m128i *) (transform16x16_1[3][5])))); /* Compute E6 */ E6l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); E6h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); E6l = _mm_add_epi32(E6l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_1[1][6])))); E6h = _mm_add_epi32(E6h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_1[1][6])))); E6l = _mm_add_epi32(E6l, _mm_madd_epi16(m128Tmp4, _mm_load_si128( (__m128i *) (transform16x16_1[2][6])))); E6h = _mm_add_epi32(E6h, _mm_madd_epi16(m128Tmp5, _mm_load_si128( (__m128i *) (transform16x16_1[2][6])))); E6l = _mm_add_epi32(E6l, _mm_madd_epi16(m128Tmp6, _mm_load_si128( 
(__m128i *) (transform16x16_1[3][6])))); E6h = _mm_add_epi32(E6h, _mm_madd_epi16(m128Tmp7, _mm_load_si128( (__m128i *) (transform16x16_1[3][6])))); /* Compute E7 */ E7l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); E7h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); E7l = _mm_add_epi32(E7l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_1[1][7])))); E7h = _mm_add_epi32(E7h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_1[1][7])))); E7l = _mm_add_epi32(E7l, _mm_madd_epi16(m128Tmp4, _mm_load_si128( (__m128i *) (transform16x16_1[2][7])))); E7h = _mm_add_epi32(E7h, _mm_madd_epi16(m128Tmp5, _mm_load_si128( (__m128i *) (transform16x16_1[2][7])))); E7l = _mm_add_epi32(E7l, _mm_madd_epi16(m128Tmp6, _mm_load_si128( (__m128i *) (transform16x16_1[3][7])))); E7h = _mm_add_epi32(E7h, _mm_madd_epi16(m128Tmp7, _mm_load_si128( (__m128i *) (transform16x16_1[3][7])))); /* Compute E00, E01, E02 and E03 */ m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12); E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12); E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28); E00l = _mm_add_epi32(E00l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_2[1][0])))); m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28); E00h = _mm_add_epi32(E00h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_2[1][0])))); E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); E01l = _mm_add_epi32(E01l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_2[1][1])))); E01h = _mm_add_epi32(E01h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_2[1][1])))); E02l = 
_mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); E02h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); E02l = _mm_add_epi32(E02l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_2[1][2])))); E02h = _mm_add_epi32(E02h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_2[1][2])))); E03l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); E03h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); E03l = _mm_add_epi32(E03l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_2[1][3])))); E03h = _mm_add_epi32(E03h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_2[1][3])))); /* Compute EE0 and EEE */ m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24); EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24); EE0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16); EEE0l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16); EEE0h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); EEE1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); EEE1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); /* Compute EE */ EE2l = _mm_sub_epi32(EEE1l, EE1l); EE3l = _mm_sub_epi32(EEE0l, EE0l); EE2h = _mm_sub_epi32(EEE1h, EE1h); EE3h = _mm_sub_epi32(EEE0h, EE0h); EE0l = _mm_add_epi32(EEE0l, EE0l); EE1l = _mm_add_epi32(EEE1l, EE1l); EE0h = _mm_add_epi32(EEE0h, EE0h); EE1h = _mm_add_epi32(EEE1h, EE1h); 
/**/ EE7l = _mm_sub_epi32(EE0l, E00l); EE6l = _mm_sub_epi32(EE1l, E01l); EE5l = _mm_sub_epi32(EE2l, E02l); EE4l = _mm_sub_epi32(EE3l, E03l); EE7h = _mm_sub_epi32(EE0h, E00h); EE6h = _mm_sub_epi32(EE1h, E01h); EE5h = _mm_sub_epi32(EE2h, E02h); EE4h = _mm_sub_epi32(EE3h, E03h); EE0l = _mm_add_epi32(EE0l, E00l); EE1l = _mm_add_epi32(EE1l, E01l); EE2l = _mm_add_epi32(EE2l, E02l); EE3l = _mm_add_epi32(EE3l, E03l); EE0h = _mm_add_epi32(EE0h, E00h); EE1h = _mm_add_epi32(EE1h, E01h); EE2h = _mm_add_epi32(EE2h, E02h); EE3h = _mm_add_epi32(EE3h, E03h); /* Compute E */ E15l = _mm_sub_epi32(EE0l, E0l); E15l = _mm_add_epi32(E15l, m128iAdd); E14l = _mm_sub_epi32(EE1l, E1l); E14l = _mm_add_epi32(E14l, m128iAdd); E13l = _mm_sub_epi32(EE2l, E2l); E13l = _mm_add_epi32(E13l, m128iAdd); E12l = _mm_sub_epi32(EE3l, E3l); E12l = _mm_add_epi32(E12l, m128iAdd); E11l = _mm_sub_epi32(EE4l, E4l); E11l = _mm_add_epi32(E11l, m128iAdd); E10l = _mm_sub_epi32(EE5l, E5l); E10l = _mm_add_epi32(E10l, m128iAdd); E9l = _mm_sub_epi32(EE6l, E6l); E9l = _mm_add_epi32(E9l, m128iAdd); E8l = _mm_sub_epi32(EE7l, E7l); E8l = _mm_add_epi32(E8l, m128iAdd); E0l = _mm_add_epi32(EE0l, E0l); E0l = _mm_add_epi32(E0l, m128iAdd); E1l = _mm_add_epi32(EE1l, E1l); E1l = _mm_add_epi32(E1l, m128iAdd); E2l = _mm_add_epi32(EE2l, E2l); E2l = _mm_add_epi32(E2l, m128iAdd); E3l = _mm_add_epi32(EE3l, E3l); E3l = _mm_add_epi32(E3l, m128iAdd); E4l = _mm_add_epi32(EE4l, E4l); E4l = _mm_add_epi32(E4l, m128iAdd); E5l = _mm_add_epi32(EE5l, E5l); E5l = _mm_add_epi32(E5l, m128iAdd); E6l = _mm_add_epi32(EE6l, E6l); E6l = _mm_add_epi32(E6l, m128iAdd); E7l = _mm_add_epi32(EE7l, E7l); E7l = _mm_add_epi32(E7l, m128iAdd); E15h = _mm_sub_epi32(EE0h, E0h); E15h = _mm_add_epi32(E15h, m128iAdd); E14h = _mm_sub_epi32(EE1h, E1h); E14h = _mm_add_epi32(E14h, m128iAdd); E13h = _mm_sub_epi32(EE2h, E2h); E13h = _mm_add_epi32(E13h, m128iAdd); E12h = _mm_sub_epi32(EE3h, E3h); E12h = _mm_add_epi32(E12h, m128iAdd); E11h = _mm_sub_epi32(EE4h, E4h); E11h = 
_mm_add_epi32(E11h, m128iAdd); E10h = _mm_sub_epi32(EE5h, E5h); E10h = _mm_add_epi32(E10h, m128iAdd); E9h = _mm_sub_epi32(EE6h, E6h); E9h = _mm_add_epi32(E9h, m128iAdd); E8h = _mm_sub_epi32(EE7h, E7h); E8h = _mm_add_epi32(E8h, m128iAdd); E0h = _mm_add_epi32(EE0h, E0h); E0h = _mm_add_epi32(E0h, m128iAdd); E1h = _mm_add_epi32(EE1h, E1h); E1h = _mm_add_epi32(E1h, m128iAdd); E2h = _mm_add_epi32(EE2h, E2h); E2h = _mm_add_epi32(E2h, m128iAdd); E3h = _mm_add_epi32(EE3h, E3h); E3h = _mm_add_epi32(E3h, m128iAdd); E4h = _mm_add_epi32(EE4h, E4h); E4h = _mm_add_epi32(E4h, m128iAdd); E5h = _mm_add_epi32(EE5h, E5h); E5h = _mm_add_epi32(E5h, m128iAdd); E6h = _mm_add_epi32(EE6h, E6h); E6h = _mm_add_epi32(E6h, m128iAdd); E7h = _mm_add_epi32(EE7h, E7h); E7h = _mm_add_epi32(E7h, m128iAdd); m128iS0 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift)); m128iS1 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift)); m128iS2 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift)); m128iS3 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift)); m128iS4 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift), _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift)); m128iS5 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift), _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift)); m128iS6 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift), _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift)); m128iS7 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift), _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift)); m128iS8 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift), _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift)); m128iS9 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift), 
_mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift)); m128iS10 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift), _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift)); m128iS11 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift), _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift)); m128iS12 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift), _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift)); m128iS13 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift), _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift)); m128iS14 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift), _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift)); m128iS15 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift), _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift)); m128iS31 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift)); m128iS30 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift)); m128iS29 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift)); m128iS28 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift)); m128iS27 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift), _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift)); m128iS26 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift), _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift)); m128iS25 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift), _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift)); m128iS24 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift), _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift)); m128iS23 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift), _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift)); m128iS22 = _mm_packs_epi32( 
_mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift), _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift)); m128iS21 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift), _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift)); m128iS20 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift), _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift)); m128iS19 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift), _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift)); m128iS18 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift), _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift)); m128iS17 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift), _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift)); m128iS16 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift), _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift)); if (!j) { /* Inverse the matrix */ E0l = _mm_unpacklo_epi16(m128iS0, m128iS16); E1l = _mm_unpacklo_epi16(m128iS1, m128iS17); E2l = _mm_unpacklo_epi16(m128iS2, m128iS18); E3l = _mm_unpacklo_epi16(m128iS3, m128iS19); E4l = _mm_unpacklo_epi16(m128iS4, m128iS20); E5l = _mm_unpacklo_epi16(m128iS5, m128iS21); E6l = _mm_unpacklo_epi16(m128iS6, m128iS22); E7l = _mm_unpacklo_epi16(m128iS7, m128iS23); E8l = _mm_unpacklo_epi16(m128iS8, m128iS24); E9l = _mm_unpacklo_epi16(m128iS9, m128iS25); E10l = _mm_unpacklo_epi16(m128iS10, m128iS26); E11l = _mm_unpacklo_epi16(m128iS11, m128iS27); E12l = _mm_unpacklo_epi16(m128iS12, m128iS28); E13l = _mm_unpacklo_epi16(m128iS13, m128iS29); E14l = _mm_unpacklo_epi16(m128iS14, m128iS30); E15l = _mm_unpacklo_epi16(m128iS15, m128iS31); O0l = _mm_unpackhi_epi16(m128iS0, m128iS16); O1l = _mm_unpackhi_epi16(m128iS1, m128iS17); O2l = _mm_unpackhi_epi16(m128iS2, m128iS18); O3l = _mm_unpackhi_epi16(m128iS3, m128iS19); O4l = _mm_unpackhi_epi16(m128iS4, m128iS20); O5l = _mm_unpackhi_epi16(m128iS5, m128iS21); O6l = _mm_unpackhi_epi16(m128iS6, m128iS22); O7l = _mm_unpackhi_epi16(m128iS7, m128iS23); 
O8l = _mm_unpackhi_epi16(m128iS8, m128iS24); O9l = _mm_unpackhi_epi16(m128iS9, m128iS25); O10l = _mm_unpackhi_epi16(m128iS10, m128iS26); O11l = _mm_unpackhi_epi16(m128iS11, m128iS27); O12l = _mm_unpackhi_epi16(m128iS12, m128iS28); O13l = _mm_unpackhi_epi16(m128iS13, m128iS29); O14l = _mm_unpackhi_epi16(m128iS14, m128iS30); O15l = _mm_unpackhi_epi16(m128iS15, m128iS31); E0h = _mm_unpacklo_epi16(E0l, E8l); E1h = _mm_unpacklo_epi16(E1l, E9l); E2h = _mm_unpacklo_epi16(E2l, E10l); E3h = _mm_unpacklo_epi16(E3l, E11l); E4h = _mm_unpacklo_epi16(E4l, E12l); E5h = _mm_unpacklo_epi16(E5l, E13l); E6h = _mm_unpacklo_epi16(E6l, E14l); E7h = _mm_unpacklo_epi16(E7l, E15l); E8h = _mm_unpackhi_epi16(E0l, E8l); E9h = _mm_unpackhi_epi16(E1l, E9l); E10h = _mm_unpackhi_epi16(E2l, E10l); E11h = _mm_unpackhi_epi16(E3l, E11l); E12h = _mm_unpackhi_epi16(E4l, E12l); E13h = _mm_unpackhi_epi16(E5l, E13l); E14h = _mm_unpackhi_epi16(E6l, E14l); E15h = _mm_unpackhi_epi16(E7l, E15l); m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h); m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, 
m128Tmp3); m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h); m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h); m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h); m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h); m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h); m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h); m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); /* */ E0h = _mm_unpacklo_epi16(O0l, O8l); E1h = _mm_unpacklo_epi16(O1l, O9l); E2h = _mm_unpacklo_epi16(O2l, O10l); E3h = _mm_unpacklo_epi16(O3l, O11l); E4h = _mm_unpacklo_epi16(O4l, O12l); E5h = _mm_unpacklo_epi16(O5l, O13l); E6h = _mm_unpacklo_epi16(O6l, O14l); E7h = _mm_unpacklo_epi16(O7l, O15l); E8h = _mm_unpackhi_epi16(O0l, O8l); E9h = _mm_unpackhi_epi16(O1l, O9l); E10h = _mm_unpackhi_epi16(O2l, O10l); E11h = _mm_unpackhi_epi16(O3l, O11l); E12h = _mm_unpackhi_epi16(O4l, O12l); E13h = _mm_unpackhi_epi16(O5l, O13l); E14h = _mm_unpackhi_epi16(O6l, O14l); E15h = _mm_unpackhi_epi16(O7l, O15l); m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); m128Tmp3 = 
_mm_unpacklo_epi16(E3h, E7h); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h); m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h); m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h); m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h); m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h); m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h); m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h); m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = 
_mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); if(i==0){ int k = 8; r0=m128iS0; r1=m128iS1; r2=m128iS2; r3=m128iS3; r4=m128iS4; r5=m128iS5; r6=m128iS6; r7=m128iS7; r8=m128iS8; r9=m128iS9; r10=m128iS10; r11=m128iS11; r12=m128iS12; r13=m128iS13; r14=m128iS14; r15=m128iS15; r16=m128iS16; r17=m128iS17; r18=m128iS18; r19=m128iS19; r20=m128iS20; r21=m128iS21; r22=m128iS22; r23=m128iS23; r24=m128iS24; r25=m128iS25; r26=m128iS26; r27=m128iS27; r28=m128iS28; r29=m128iS29; r30=m128iS30; r31=m128iS31; m128iS0 = _mm_load_si128((__m128i *) (src + k)); m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k)); m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k)); m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k)); m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k)); m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k)); m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k)); m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k)); m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k)); m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k)); m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k)); m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k)); m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k)); m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k)); m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k)); m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k)); m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k)); m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k)); m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k)); m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k)); m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k)); m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k)); m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k)); m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k)); m128iS24 = 
_mm_load_si128((__m128i *) (src + 768 + k)); m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k)); m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k)); m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k)); m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k)); m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k)); m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k)); m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k)); }else if(i ==8){ r32=m128iS0; r33=m128iS1; r34=m128iS2; r35=m128iS3; r36=m128iS4; r37=m128iS5; r38=m128iS6; r39=m128iS7; r40=m128iS8; r41=m128iS9; r42=m128iS10; r43=m128iS11; r44=m128iS12; r45=m128iS13; r46=m128iS14; r47=m128iS15; r48=m128iS16; r49=m128iS17; r50=m128iS18; r51=m128iS19; r52=m128iS20; r53=m128iS21; r54=m128iS22; r55=m128iS23; r56=m128iS24; r57=m128iS25; r58=m128iS26; r59=m128iS27; r60=m128iS28; r61=m128iS29; r62=m128iS30; r63=m128iS31; m128iS0 = _mm_load_si128((__m128i *) (src + 16)); m128iS1 = _mm_load_si128((__m128i *) (src + 48)); m128iS2 = _mm_load_si128((__m128i *) (src + 80)); m128iS3 = _mm_load_si128((__m128i *) (src + 112)); m128iS4 = _mm_load_si128((__m128i *) (src + 144)); m128iS5 = _mm_load_si128((__m128i *) (src + 176)); m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 16)); m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 16)); m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 16)); m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 16)); m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 16)); m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 16)); m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 16)); m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 16)); m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 16)); m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 16)); m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 16)); m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 16)); m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 16)); m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 16)); m128iS20 = 
_mm_load_si128((__m128i *) (src + 640 + 16)); m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 16)); m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 16)); m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 16)); m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 16)); m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 16)); m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 16)); m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 16)); m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 16)); m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 16)); m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 16)); m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 16)); }else if(i ==16){ r64=m128iS0; r65=m128iS1; r66=m128iS2; r67=m128iS3; r68=m128iS4; r69=m128iS5; r70=m128iS6; r71=m128iS7; r72=m128iS8; r73=m128iS9; r74=m128iS10; r75=m128iS11; r76=m128iS12; r77=m128iS13; r78=m128iS14; r79=m128iS15; r80=m128iS16; r81=m128iS17; r82=m128iS18; r83=m128iS19; r84=m128iS20; r85=m128iS21; r86=m128iS22; r87=m128iS23; r88=m128iS24; r89=m128iS25; r90=m128iS26; r91=m128iS27; r92=m128iS28; r93=m128iS29; r94=m128iS30; r95=m128iS31; m128iS0 = _mm_load_si128((__m128i *) (src + 24)); m128iS1 = _mm_load_si128((__m128i *) (src + 56)); m128iS2 = _mm_load_si128((__m128i *) (src + 64 + 24)); m128iS3 = _mm_load_si128((__m128i *) (src + 96 + 24)); m128iS4 = _mm_load_si128((__m128i *) (src + 128 + 24)); m128iS5 = _mm_load_si128((__m128i *) (src + 160 + 24)); m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 24)); m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 24)); m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 24)); m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 24)); m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 24)); m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 24)); m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 24)); m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 24)); m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 24)); m128iS15 = _mm_load_si128((__m128i *) 
(src + 480 + 24)); m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 24)); m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 24)); m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 24)); m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 24)); m128iS20 = _mm_load_si128((__m128i *) (src + 640 + 24)); m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 24)); m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 24)); m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 24)); m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 24)); m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 24)); m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 24)); m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 24)); m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 24)); m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 24)); m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 24)); m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 24)); }else{ r96=m128iS0; r97=m128iS1; r98=m128iS2; r99=m128iS3; r100=m128iS4; r101=m128iS5; r102=m128iS6; r103=m128iS7; r104=m128iS8; r105=m128iS9; r106=m128iS10; r107=m128iS11; r108=m128iS12; r109=m128iS13; r110=m128iS14; r111=m128iS15; r112=m128iS16; r113=m128iS17; r114=m128iS18; r115=m128iS19; r116=m128iS20; r117=m128iS21; r118=m128iS22; r119=m128iS23; r120=m128iS24; r121=m128iS25; r122=m128iS26; r123=m128iS27; r124=m128iS28; r125=m128iS29; r126=m128iS30; r127=m128iS31; //load data for next j : m128iS0 = r0; m128iS1 = r4; m128iS2 = r8; m128iS3 = r12; m128iS4 = r16; m128iS5 = r20; m128iS6 = r24; m128iS7 = r28; m128iS8 = r32; m128iS9 = r36; m128iS10 = r40; m128iS11 = r44; m128iS12 = r48; m128iS13 = r52; m128iS14 = r56; m128iS15 = r60; m128iS16 = r64; m128iS17 = r68; m128iS18 = r72; m128iS19 = r76; m128iS20 = r80; m128iS21 = r84; m128iS22 = r88; m128iS23 = r92; m128iS24 = r96; m128iS25 = r100; m128iS26 = r104; m128iS27 = r108; m128iS28 = r112; m128iS29 = r116; m128iS30 = r120; m128iS31 =r124; shift = shift_2nd; m128iAdd = _mm_set1_epi32(add_2nd); } } else { 
//Transpose Matrix E0l= _mm_unpacklo_epi16(m128iS0,m128iS1); E1l= _mm_unpacklo_epi16(m128iS2,m128iS3); E2l= _mm_unpacklo_epi16(m128iS4,m128iS5); E3l= _mm_unpacklo_epi16(m128iS6,m128iS7); E4l= _mm_unpacklo_epi16(m128iS8,m128iS9); E5l= _mm_unpacklo_epi16(m128iS10,m128iS11); E6l= _mm_unpacklo_epi16(m128iS12,m128iS13); E7l= _mm_unpacklo_epi16(m128iS14,m128iS15); E8l= _mm_unpacklo_epi16(m128iS16,m128iS17); E9l= _mm_unpacklo_epi16(m128iS18,m128iS19); E10l= _mm_unpacklo_epi16(m128iS20,m128iS21); E11l= _mm_unpacklo_epi16(m128iS22,m128iS23); E12l= _mm_unpacklo_epi16(m128iS24,m128iS25); E13l= _mm_unpacklo_epi16(m128iS26,m128iS27); E14l= _mm_unpacklo_epi16(m128iS28,m128iS29); E15l= _mm_unpacklo_epi16(m128iS30,m128iS31); E0h= _mm_unpackhi_epi16(m128iS0,m128iS1); E1h= _mm_unpackhi_epi16(m128iS2,m128iS3); E2h= _mm_unpackhi_epi16(m128iS4,m128iS5); E3h= _mm_unpackhi_epi16(m128iS6,m128iS7); E4h= _mm_unpackhi_epi16(m128iS8,m128iS9); E5h= _mm_unpackhi_epi16(m128iS10,m128iS11); E6h= _mm_unpackhi_epi16(m128iS12,m128iS13); E7h= _mm_unpackhi_epi16(m128iS14,m128iS15); E8h= _mm_unpackhi_epi16(m128iS16,m128iS17); E9h= _mm_unpackhi_epi16(m128iS18,m128iS19); E10h= _mm_unpackhi_epi16(m128iS20,m128iS21); E11h= _mm_unpackhi_epi16(m128iS22,m128iS23); E12h= _mm_unpackhi_epi16(m128iS24,m128iS25); E13h= _mm_unpackhi_epi16(m128iS26,m128iS27); E14h= _mm_unpackhi_epi16(m128iS28,m128iS29); E15h= _mm_unpackhi_epi16(m128iS30,m128iS31); m128Tmp0= _mm_unpacklo_epi32(E0l,E1l); m128Tmp1= _mm_unpacklo_epi32(E2l,E3l); m128Tmp2= _mm_unpacklo_epi32(E4l,E5l); m128Tmp3= _mm_unpacklo_epi32(E6l,E7l); m128Tmp4= _mm_unpacklo_epi32(E8l,E9l); m128Tmp5= _mm_unpacklo_epi32(E10l,E11l); m128Tmp6= _mm_unpacklo_epi32(E12l,E13l); m128Tmp7= _mm_unpacklo_epi32(E14l,E15l); m128iS0= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter 1st row m128iS1= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter 1st row m128iS2= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter 1st row m128iS3= 
_mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter 1st row //second row m128iS4= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter m128iS5= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter m128iS6= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter m128iS7= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter //third row m128Tmp0= _mm_unpackhi_epi32(E0l,E1l); m128Tmp1= _mm_unpackhi_epi32(E2l,E3l); m128Tmp2= _mm_unpackhi_epi32(E4l,E5l); m128Tmp3= _mm_unpackhi_epi32(E6l,E7l); m128Tmp4= _mm_unpackhi_epi32(E8l,E9l); m128Tmp5= _mm_unpackhi_epi32(E10l,E11l); m128Tmp6= _mm_unpackhi_epi32(E12l,E13l); m128Tmp7= _mm_unpackhi_epi32(E14l,E15l); m128iS8= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter m128iS9= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter m128iS10= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter m128iS11= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter //fourth row m128iS12= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter m128iS13= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter m128iS14= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter m128iS15= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter //fifth row m128Tmp0= _mm_unpacklo_epi32(E0h,E1h); m128Tmp1= _mm_unpacklo_epi32(E2h,E3h); m128Tmp2= _mm_unpacklo_epi32(E4h,E5h); m128Tmp3= _mm_unpacklo_epi32(E6h,E7h); m128Tmp4= _mm_unpacklo_epi32(E8h,E9h); m128Tmp5= _mm_unpacklo_epi32(E10h,E11h); m128Tmp6= _mm_unpacklo_epi32(E12h,E13h); m128Tmp7= _mm_unpacklo_epi32(E14h,E15h); m128iS16= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter m128iS17= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter m128iS18= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter m128iS19= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //sixth row m128iS20= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter m128iS21= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter m128iS22= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter m128iS23= 
_mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter //seventh row m128Tmp0= _mm_unpackhi_epi32(E0h,E1h); m128Tmp1= _mm_unpackhi_epi32(E2h,E3h); m128Tmp2= _mm_unpackhi_epi32(E4h,E5h); m128Tmp3= _mm_unpackhi_epi32(E6h,E7h); m128Tmp4= _mm_unpackhi_epi32(E8h,E9h); m128Tmp5= _mm_unpackhi_epi32(E10h,E11h); m128Tmp6= _mm_unpackhi_epi32(E12h,E13h); m128Tmp7= _mm_unpackhi_epi32(E14h,E15h); m128iS24= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter m128iS25= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter m128iS26= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter m128iS27= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter //last row m128iS28= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter m128iS29= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter m128iS30= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter m128iS31= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter m128Tmp0=_mm_setzero_si128(); //store dst = (uint8_t*) _dst + i*stride; E0l= _mm_load_si128((__m128i*)dst); //16 values E1l= _mm_load_si128((__m128i*)(dst+16)); E2l= _mm_load_si128((__m128i*)(dst+stride)); E3l= _mm_load_si128((__m128i*)(dst+stride+16)); E4l= _mm_load_si128((__m128i*)(dst+2*stride)); E5l= _mm_load_si128((__m128i*)(dst+2*stride+16)); E6l= _mm_load_si128((__m128i*)(dst+3*stride)); E7l= _mm_load_si128((__m128i*)(dst+3*stride+16)); E8l= _mm_load_si128((__m128i*)(dst+4*stride)); E9l= _mm_load_si128((__m128i*)(dst+4*stride+16)); E10l= _mm_load_si128((__m128i*)(dst+5*stride)); E11l= _mm_load_si128((__m128i*)(dst+5*stride+16)); E12l= _mm_load_si128((__m128i*)(dst+6*stride)); E13l= _mm_load_si128((__m128i*)(dst+6*stride+16)); E14l= _mm_load_si128((__m128i*)(dst+7*stride)); E15l= _mm_load_si128((__m128i*)(dst+7*stride+16)); m128iS0= _mm_adds_epi16(m128iS0,_mm_unpacklo_epi8(E0l,m128Tmp0)); m128iS1= _mm_adds_epi16(m128iS1,_mm_unpackhi_epi8(E0l,m128Tmp0)); m128iS0= _mm_packus_epi16(m128iS0,m128iS1); m128iS2= 
_mm_adds_epi16(m128iS2,_mm_unpacklo_epi8(E1l,m128Tmp0)); m128iS3= _mm_adds_epi16(m128iS3,_mm_unpackhi_epi8(E1l,m128Tmp0)); m128iS2= _mm_packus_epi16(m128iS2,m128iS3); m128iS4= _mm_adds_epi16(m128iS4,_mm_unpacklo_epi8(E2l,m128Tmp0)); m128iS5= _mm_adds_epi16(m128iS5,_mm_unpackhi_epi8(E2l,m128Tmp0)); m128iS4= _mm_packus_epi16(m128iS4,m128iS5); m128iS6= _mm_adds_epi16(m128iS6,_mm_unpacklo_epi8(E3l,m128Tmp0)); m128iS7= _mm_adds_epi16(m128iS7,_mm_unpackhi_epi8(E3l,m128Tmp0)); m128iS6= _mm_packus_epi16(m128iS6,m128iS7); m128iS8= _mm_adds_epi16(m128iS8,_mm_unpacklo_epi8(E4l,m128Tmp0)); m128iS9= _mm_adds_epi16(m128iS9,_mm_unpackhi_epi8(E4l,m128Tmp0)); m128iS8= _mm_packus_epi16(m128iS8,m128iS9); m128iS10= _mm_adds_epi16(m128iS10,_mm_unpacklo_epi8(E5l,m128Tmp0)); m128iS11= _mm_adds_epi16(m128iS11,_mm_unpackhi_epi8(E5l,m128Tmp0)); m128iS10= _mm_packus_epi16(m128iS10,m128iS11); m128iS12= _mm_adds_epi16(m128iS12,_mm_unpacklo_epi8(E6l,m128Tmp0)); m128iS13= _mm_adds_epi16(m128iS13,_mm_unpackhi_epi8(E6l,m128Tmp0)); m128iS12= _mm_packus_epi16(m128iS12,m128iS13); m128iS14= _mm_adds_epi16(m128iS14,_mm_unpacklo_epi8(E7l,m128Tmp0)); m128iS15= _mm_adds_epi16(m128iS15,_mm_unpackhi_epi8(E7l,m128Tmp0)); m128iS14= _mm_packus_epi16(m128iS14,m128iS15); m128iS16= _mm_adds_epi16(m128iS16,_mm_unpacklo_epi8(E8l,m128Tmp0)); m128iS17= _mm_adds_epi16(m128iS17,_mm_unpackhi_epi8(E8l,m128Tmp0)); m128iS16= _mm_packus_epi16(m128iS16,m128iS17); m128iS18= _mm_adds_epi16(m128iS18,_mm_unpacklo_epi8(E9l,m128Tmp0)); m128iS19= _mm_adds_epi16(m128iS19,_mm_unpackhi_epi8(E9l,m128Tmp0)); m128iS18= _mm_packus_epi16(m128iS18,m128iS19); m128iS20= _mm_adds_epi16(m128iS20,_mm_unpacklo_epi8(E10l,m128Tmp0)); m128iS21= _mm_adds_epi16(m128iS21,_mm_unpackhi_epi8(E10l,m128Tmp0)); m128iS20= _mm_packus_epi16(m128iS20,m128iS21); m128iS22= _mm_adds_epi16(m128iS22,_mm_unpacklo_epi8(E11l,m128Tmp0)); m128iS23= _mm_adds_epi16(m128iS23,_mm_unpackhi_epi8(E11l,m128Tmp0)); m128iS22= _mm_packus_epi16(m128iS22,m128iS23); m128iS24= 
_mm_adds_epi16(m128iS24,_mm_unpacklo_epi8(E12l,m128Tmp0)); m128iS25= _mm_adds_epi16(m128iS25,_mm_unpackhi_epi8(E12l,m128Tmp0)); m128iS24= _mm_packus_epi16(m128iS24,m128iS25); m128iS26= _mm_adds_epi16(m128iS26,_mm_unpacklo_epi8(E13l,m128Tmp0)); m128iS27= _mm_adds_epi16(m128iS27,_mm_unpackhi_epi8(E13l,m128Tmp0)); m128iS26= _mm_packus_epi16(m128iS26,m128iS27); m128iS28= _mm_adds_epi16(m128iS28,_mm_unpacklo_epi8(E14l,m128Tmp0)); m128iS29= _mm_adds_epi16(m128iS29,_mm_unpackhi_epi8(E14l,m128Tmp0)); m128iS28= _mm_packus_epi16(m128iS28,m128iS29); m128iS30= _mm_adds_epi16(m128iS30,_mm_unpacklo_epi8(E15l,m128Tmp0)); m128iS31= _mm_adds_epi16(m128iS31,_mm_unpackhi_epi8(E15l,m128Tmp0)); m128iS30= _mm_packus_epi16(m128iS30,m128iS31); _mm_store_si128((__m128i*)dst,m128iS0); _mm_store_si128((__m128i*)(dst+16),m128iS2); _mm_store_si128((__m128i*)(dst+stride),m128iS4); _mm_store_si128((__m128i*)(dst+stride+16),m128iS6); _mm_store_si128((__m128i*)(dst+2*stride),m128iS8); _mm_store_si128((__m128i*)(dst+2*stride+16),m128iS10); _mm_store_si128((__m128i*)(dst+3*stride),m128iS12); _mm_store_si128((__m128i*)(dst+3*stride+16),m128iS14); _mm_store_si128((__m128i*)(dst+4*stride),m128iS16); _mm_store_si128((__m128i*)(dst+4*stride+16),m128iS18); _mm_store_si128((__m128i*)(dst+5*stride),m128iS20); _mm_store_si128((__m128i*)(dst+5*stride+16),m128iS22); _mm_store_si128((__m128i*)(dst+6*stride),m128iS24); _mm_store_si128((__m128i*)(dst+6*stride+16),m128iS26); _mm_store_si128((__m128i*)(dst+7*stride),m128iS28); _mm_store_si128((__m128i*)(dst+7*stride+16),m128iS30); if(i==0){ //load next values : m128iS0 = r1; m128iS1 = r5; m128iS2 = r9; m128iS3 = r13; m128iS4 = r17; m128iS5 = r21; m128iS6 = r25; m128iS7 = r29; m128iS8 = r33; m128iS9 = r37; m128iS10 = r41; m128iS11 = r45; m128iS12 = r49; m128iS13 = r53; m128iS14 = r57; m128iS15 = r61; m128iS16 = r65; m128iS17 = r69; m128iS18 = r73; m128iS19 = r77; m128iS20 = r81; m128iS21 = r85; m128iS22 = r89; m128iS23 = r93; m128iS24 = r97; m128iS25 = r101; 
m128iS26 = r105; m128iS27 = r109; m128iS28 = r113; m128iS29 = r117; m128iS30 = r121; m128iS31 =r125; }else if(i ==8){ //load next values : m128iS0 = r2; m128iS1 = r6; m128iS2 = r10; m128iS3 = r14; m128iS4 = r18; m128iS5 = r22; m128iS6 = r26; m128iS7 = r30; m128iS8 = r34; m128iS9 = r38; m128iS10 = r42; m128iS11 = r46; m128iS12 = r50; m128iS13 = r54; m128iS14 = r58; m128iS15 = r62; m128iS16 = r66; m128iS17 = r70; m128iS18 = r74; m128iS19 = r78; m128iS20 = r82; m128iS21 = r86; m128iS22 = r90; m128iS23 = r94; m128iS24 = r98; m128iS25 = r102; m128iS26 = r106; m128iS27 = r110; m128iS28 = r114; m128iS29 = r118; m128iS30 = r122; m128iS31 =r126; }else if(i==16) { //load next values : m128iS0 = r3; m128iS1 = r7; m128iS2 = r11; m128iS3 = r15; m128iS4 = r19; m128iS5 = r23; m128iS6 = r27; m128iS7 = r31; m128iS8 = r35; m128iS9 = r39; m128iS10 = r43; m128iS11 = r47; m128iS12 = r51; m128iS13 = r55; m128iS14 = r59; m128iS15 = r63; m128iS16 = r67; m128iS17 = r71; m128iS18 = r75; m128iS19 = r79; m128iS20 = r83; m128iS21 = r87; m128iS22 = r91; m128iS23 = r95; m128iS24 = r99; m128iS25 = r103; m128iS26 = r107; m128iS27 = r111; m128iS28 = r115; m128iS29 = r119; m128iS30 = r123; m128iS31 =r127; } } } } } #endif #if 0 void ff_hevc_transform_32x32_add_10_sse4(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride) { int i, j; uint16_t *dst = (uint16_t*) _dst; ptrdiff_t stride = _stride / 2; int shift; uint8_t shift_2nd = 10; //20 - bit depth uint16_t add_2nd = 1<<9; //shift2 - 1 int16_t *src = coeffs; __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13, m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h, O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l, E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h; __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, 
E13l, E14l, E15l; __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h, EEE0l, EEE1l, EEE0h, EEE1h; __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21, m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27, m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9, m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15, O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l, O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l, EE4l, EE7h, EE6h, EE5h, EE4h; m128iS0 = _mm_load_si128((__m128i *) (src)); m128iS1 = _mm_load_si128((__m128i *) (src + 32)); m128iS2 = _mm_load_si128((__m128i *) (src + 64)); m128iS3 = _mm_load_si128((__m128i *) (src + 96)); m128iS4 = _mm_loadu_si128((__m128i *) (src + 128)); m128iS5 = _mm_load_si128((__m128i *) (src + 160)); m128iS6 = _mm_load_si128((__m128i *) (src + 192)); m128iS7 = _mm_load_si128((__m128i *) (src + 224)); m128iS8 = _mm_load_si128((__m128i *) (src + 256)); m128iS9 = _mm_load_si128((__m128i *) (src + 288)); m128iS10 = _mm_load_si128((__m128i *) (src + 320)); m128iS11 = _mm_load_si128((__m128i *) (src + 352)); m128iS12 = _mm_loadu_si128((__m128i *) (src + 384)); m128iS13 = _mm_load_si128((__m128i *) (src + 416)); m128iS14 = _mm_load_si128((__m128i *) (src + 448)); m128iS15 = _mm_load_si128((__m128i *) (src + 480)); m128iS16 = _mm_load_si128((__m128i *) (src + 512)); m128iS17 = _mm_load_si128((__m128i *) (src + 544)); m128iS18 = _mm_load_si128((__m128i *) (src + 576)); m128iS19 = _mm_load_si128((__m128i *) (src + 608)); m128iS20 = _mm_load_si128((__m128i *) (src + 640)); m128iS21 = _mm_load_si128((__m128i *) (src + 672)); m128iS22 = _mm_load_si128((__m128i *) (src + 704)); m128iS23 = _mm_load_si128((__m128i *) (src + 736)); m128iS24 = _mm_load_si128((__m128i *) (src + 768)); m128iS25 = _mm_load_si128((__m128i *) (src + 800)); m128iS26 = _mm_load_si128((__m128i *) (src + 832)); m128iS27 = _mm_load_si128((__m128i *) (src + 864)); m128iS28 = 
_mm_load_si128((__m128i *) (src + 896)); m128iS29 = _mm_load_si128((__m128i *) (src + 928)); m128iS30 = _mm_load_si128((__m128i *) (src + 960)); m128iS31 = _mm_load_si128((__m128i *) (src + 992)); shift = shift_1st; m128iAdd = _mm_set1_epi32(add_1st); for (j = 0; j < 2; j++) { for (i = 0; i < 32; i += 8) { m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][0]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][0]))); m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][0]))); m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][0]))); m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][0]))); m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][0]))); m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][0]))); m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][0]))); m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][0]))); m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][0]))); m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][0]))); m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][0]))); m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27); E6l = 
_mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][0]))); m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][0]))); m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][0]))); m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][0]))); O0l = _mm_add_epi32(E0l, E1l); O0l = _mm_add_epi32(O0l, E2l); O0l = _mm_add_epi32(O0l, E3l); O0l = _mm_add_epi32(O0l, E4l); O0l = _mm_add_epi32(O0l, E5l); O0l = _mm_add_epi32(O0l, E6l); O0l = _mm_add_epi32(O0l, E7l); O0h = _mm_add_epi32(E0h, E1h); O0h = _mm_add_epi32(O0h, E2h); O0h = _mm_add_epi32(O0h, E3h); O0h = _mm_add_epi32(O0h, E4h); O0h = _mm_add_epi32(O0h, E5h); O0h = _mm_add_epi32(O0h, E6h); O0h = _mm_add_epi32(O0h, E7h); /* Compute O1*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][1]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][1]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][1]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][1]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][1]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][1]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][1]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][1]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][1]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][1]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][1]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][1]))); E6l = _mm_madd_epi16(m128Tmp12, 
_mm_load_si128((__m128i *) (transform32x32[6][1]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][1]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][1]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][1]))); O1l = _mm_add_epi32(E0l, E1l); O1l = _mm_add_epi32(O1l, E2l); O1l = _mm_add_epi32(O1l, E3l); O1l = _mm_add_epi32(O1l, E4l); O1l = _mm_add_epi32(O1l, E5l); O1l = _mm_add_epi32(O1l, E6l); O1l = _mm_add_epi32(O1l, E7l); O1h = _mm_add_epi32(E0h, E1h); O1h = _mm_add_epi32(O1h, E2h); O1h = _mm_add_epi32(O1h, E3h); O1h = _mm_add_epi32(O1h, E4h); O1h = _mm_add_epi32(O1h, E5h); O1h = _mm_add_epi32(O1h, E6h); O1h = _mm_add_epi32(O1h, E7h); /* Compute O2*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][2]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][2]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][2]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][2]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][2]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][2]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][2]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][2]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][2]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][2]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][2]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][2]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][2]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][2]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) 
(transform32x32[7][2]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][2]))); O2l = _mm_add_epi32(E0l, E1l); O2l = _mm_add_epi32(O2l, E2l); O2l = _mm_add_epi32(O2l, E3l); O2l = _mm_add_epi32(O2l, E4l); O2l = _mm_add_epi32(O2l, E5l); O2l = _mm_add_epi32(O2l, E6l); O2l = _mm_add_epi32(O2l, E7l); O2h = _mm_add_epi32(E0h, E1h); O2h = _mm_add_epi32(O2h, E2h); O2h = _mm_add_epi32(O2h, E3h); O2h = _mm_add_epi32(O2h, E4h); O2h = _mm_add_epi32(O2h, E5h); O2h = _mm_add_epi32(O2h, E6h); O2h = _mm_add_epi32(O2h, E7h); /* Compute O3*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][3]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][3]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][3]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][3]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][3]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][3]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][3]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][3]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][3]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][3]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][3]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][3]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][3]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][3]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][3]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][3]))); O3l = _mm_add_epi32(E0l, E1l); O3l = _mm_add_epi32(O3l, E2l); O3l = _mm_add_epi32(O3l, 
E3l); O3l = _mm_add_epi32(O3l, E4l); O3l = _mm_add_epi32(O3l, E5l); O3l = _mm_add_epi32(O3l, E6l); O3l = _mm_add_epi32(O3l, E7l); O3h = _mm_add_epi32(E0h, E1h); O3h = _mm_add_epi32(O3h, E2h); O3h = _mm_add_epi32(O3h, E3h); O3h = _mm_add_epi32(O3h, E4h); O3h = _mm_add_epi32(O3h, E5h); O3h = _mm_add_epi32(O3h, E6h); O3h = _mm_add_epi32(O3h, E7h); /* Compute O4*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][4]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][4]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][4]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][4]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][4]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][4]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][4]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][4]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][4]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][4]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][4]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][4]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][4]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][4]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][4]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][4]))); O4l = _mm_add_epi32(E0l, E1l); O4l = _mm_add_epi32(O4l, E2l); O4l = _mm_add_epi32(O4l, E3l); O4l = _mm_add_epi32(O4l, E4l); O4l = _mm_add_epi32(O4l, E5l); O4l = _mm_add_epi32(O4l, E6l); O4l = _mm_add_epi32(O4l, E7l); O4h = _mm_add_epi32(E0h, E1h); O4h = _mm_add_epi32(O4h, E2h); O4h = 
_mm_add_epi32(O4h, E3h); O4h = _mm_add_epi32(O4h, E4h); O4h = _mm_add_epi32(O4h, E5h); O4h = _mm_add_epi32(O4h, E6h); O4h = _mm_add_epi32(O4h, E7h); /* Compute O5*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][5]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][5]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][5]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][5]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][5]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][5]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][5]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][5]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][5]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][5]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][5]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][5]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][5]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][5]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][5]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][5]))); O5l = _mm_add_epi32(E0l, E1l); O5l = _mm_add_epi32(O5l, E2l); O5l = _mm_add_epi32(O5l, E3l); O5l = _mm_add_epi32(O5l, E4l); O5l = _mm_add_epi32(O5l, E5l); O5l = _mm_add_epi32(O5l, E6l); O5l = _mm_add_epi32(O5l, E7l); O5h = _mm_add_epi32(E0h, E1h); O5h = _mm_add_epi32(O5h, E2h); O5h = _mm_add_epi32(O5h, E3h); O5h = _mm_add_epi32(O5h, E4h); O5h = _mm_add_epi32(O5h, E5h); O5h = _mm_add_epi32(O5h, E6h); O5h = _mm_add_epi32(O5h, E7h); /* Compute O6*/ E0l = _mm_madd_epi16(m128Tmp0, 
_mm_load_si128((__m128i *) (transform32x32[0][6]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][6]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][6]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][6]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][6]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][6]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][6]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][6]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][6]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][6]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][6]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][6]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][6]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][6]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][6]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][6]))); O6l = _mm_add_epi32(E0l, E1l); O6l = _mm_add_epi32(O6l, E2l); O6l = _mm_add_epi32(O6l, E3l); O6l = _mm_add_epi32(O6l, E4l); O6l = _mm_add_epi32(O6l, E5l); O6l = _mm_add_epi32(O6l, E6l); O6l = _mm_add_epi32(O6l, E7l); O6h = _mm_add_epi32(E0h, E1h); O6h = _mm_add_epi32(O6h, E2h); O6h = _mm_add_epi32(O6h, E3h); O6h = _mm_add_epi32(O6h, E4h); O6h = _mm_add_epi32(O6h, E5h); O6h = _mm_add_epi32(O6h, E6h); O6h = _mm_add_epi32(O6h, E7h); /* Compute O7*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][7]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][7]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) 
(transform32x32[1][7]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][7]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][7]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][7]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][7]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][7]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][7]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][7]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][7]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][7]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][7]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][7]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][7]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][7]))); O7l = _mm_add_epi32(E0l, E1l); O7l = _mm_add_epi32(O7l, E2l); O7l = _mm_add_epi32(O7l, E3l); O7l = _mm_add_epi32(O7l, E4l); O7l = _mm_add_epi32(O7l, E5l); O7l = _mm_add_epi32(O7l, E6l); O7l = _mm_add_epi32(O7l, E7l); O7h = _mm_add_epi32(E0h, E1h); O7h = _mm_add_epi32(O7h, E2h); O7h = _mm_add_epi32(O7h, E3h); O7h = _mm_add_epi32(O7h, E4h); O7h = _mm_add_epi32(O7h, E5h); O7h = _mm_add_epi32(O7h, E6h); O7h = _mm_add_epi32(O7h, E7h); /* Compute O8*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][8]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][8]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][8]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][8]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][8]))); E2h = 
_mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][8]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][8]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][8]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][8]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][8]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][8]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][8]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][8]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][8]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][8]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][8]))); O8l = _mm_add_epi32(E0l, E1l); O8l = _mm_add_epi32(O8l, E2l); O8l = _mm_add_epi32(O8l, E3l); O8l = _mm_add_epi32(O8l, E4l); O8l = _mm_add_epi32(O8l, E5l); O8l = _mm_add_epi32(O8l, E6l); O8l = _mm_add_epi32(O8l, E7l); O8h = _mm_add_epi32(E0h, E1h); O8h = _mm_add_epi32(O8h, E2h); O8h = _mm_add_epi32(O8h, E3h); O8h = _mm_add_epi32(O8h, E4h); O8h = _mm_add_epi32(O8h, E5h); O8h = _mm_add_epi32(O8h, E6h); O8h = _mm_add_epi32(O8h, E7h); /* Compute O9*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][9]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][9]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][9]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][9]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][9]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][9]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][9]))); E3h = _mm_madd_epi16(m128Tmp7, 
_mm_load_si128((__m128i *) (transform32x32[3][9]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][9]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][9]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][9]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][9]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][9]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][9]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][9]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][9]))); O9l = _mm_add_epi32(E0l, E1l); O9l = _mm_add_epi32(O9l, E2l); O9l = _mm_add_epi32(O9l, E3l); O9l = _mm_add_epi32(O9l, E4l); O9l = _mm_add_epi32(O9l, E5l); O9l = _mm_add_epi32(O9l, E6l); O9l = _mm_add_epi32(O9l, E7l); O9h = _mm_add_epi32(E0h, E1h); O9h = _mm_add_epi32(O9h, E2h); O9h = _mm_add_epi32(O9h, E3h); O9h = _mm_add_epi32(O9h, E4h); O9h = _mm_add_epi32(O9h, E5h); O9h = _mm_add_epi32(O9h, E6h); O9h = _mm_add_epi32(O9h, E7h); /* Compute 10*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][10]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][10]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][10]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][10]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][10]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][10]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][10]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][10]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][10]))); E4h = _mm_madd_epi16(m128Tmp9, 
_mm_load_si128((__m128i *) (transform32x32[4][10]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][10]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][10]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][10]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][10]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][10]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][10]))); O10l = _mm_add_epi32(E0l, E1l); O10l = _mm_add_epi32(O10l, E2l); O10l = _mm_add_epi32(O10l, E3l); O10l = _mm_add_epi32(O10l, E4l); O10l = _mm_add_epi32(O10l, E5l); O10l = _mm_add_epi32(O10l, E6l); O10l = _mm_add_epi32(O10l, E7l); O10h = _mm_add_epi32(E0h, E1h); O10h = _mm_add_epi32(O10h, E2h); O10h = _mm_add_epi32(O10h, E3h); O10h = _mm_add_epi32(O10h, E4h); O10h = _mm_add_epi32(O10h, E5h); O10h = _mm_add_epi32(O10h, E6h); O10h = _mm_add_epi32(O10h, E7h); /* Compute 11*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][11]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][11]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][11]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][11]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][11]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][11]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][11]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][11]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][11]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][11]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][11]))); E5h = 
_mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][11]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][11]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][11]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][11]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][11]))); O11l = _mm_add_epi32(E0l, E1l); O11l = _mm_add_epi32(O11l, E2l); O11l = _mm_add_epi32(O11l, E3l); O11l = _mm_add_epi32(O11l, E4l); O11l = _mm_add_epi32(O11l, E5l); O11l = _mm_add_epi32(O11l, E6l); O11l = _mm_add_epi32(O11l, E7l); O11h = _mm_add_epi32(E0h, E1h); O11h = _mm_add_epi32(O11h, E2h); O11h = _mm_add_epi32(O11h, E3h); O11h = _mm_add_epi32(O11h, E4h); O11h = _mm_add_epi32(O11h, E5h); O11h = _mm_add_epi32(O11h, E6h); O11h = _mm_add_epi32(O11h, E7h); /* Compute 12*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][12]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][12]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][12]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][12]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][12]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][12]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][12]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][12]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][12]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][12]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][12]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][12]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) 
(transform32x32[6][12]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][12]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][12]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][12]))); O12l = _mm_add_epi32(E0l, E1l); O12l = _mm_add_epi32(O12l, E2l); O12l = _mm_add_epi32(O12l, E3l); O12l = _mm_add_epi32(O12l, E4l); O12l = _mm_add_epi32(O12l, E5l); O12l = _mm_add_epi32(O12l, E6l); O12l = _mm_add_epi32(O12l, E7l); O12h = _mm_add_epi32(E0h, E1h); O12h = _mm_add_epi32(O12h, E2h); O12h = _mm_add_epi32(O12h, E3h); O12h = _mm_add_epi32(O12h, E4h); O12h = _mm_add_epi32(O12h, E5h); O12h = _mm_add_epi32(O12h, E6h); O12h = _mm_add_epi32(O12h, E7h); /* Compute 13*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][13]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][13]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][13]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][13]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][13]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][13]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][13]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][13]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][13]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][13]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][13]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][13]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][13]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][13]))); E7l = _mm_madd_epi16(m128Tmp14, 
_mm_load_si128((__m128i *) (transform32x32[7][13]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][13]))); O13l = _mm_add_epi32(E0l, E1l); O13l = _mm_add_epi32(O13l, E2l); O13l = _mm_add_epi32(O13l, E3l); O13l = _mm_add_epi32(O13l, E4l); O13l = _mm_add_epi32(O13l, E5l); O13l = _mm_add_epi32(O13l, E6l); O13l = _mm_add_epi32(O13l, E7l); O13h = _mm_add_epi32(E0h, E1h); O13h = _mm_add_epi32(O13h, E2h); O13h = _mm_add_epi32(O13h, E3h); O13h = _mm_add_epi32(O13h, E4h); O13h = _mm_add_epi32(O13h, E5h); O13h = _mm_add_epi32(O13h, E6h); O13h = _mm_add_epi32(O13h, E7h); /* Compute O14 */ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][14]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][14]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][14]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][14]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][14]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][14]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][14]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][14]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][14]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][14]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][14]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][14]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][14]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][14]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][14]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][14]))); O14l = 
_mm_add_epi32(E0l, E1l); O14l = _mm_add_epi32(O14l, E2l); O14l = _mm_add_epi32(O14l, E3l); O14l = _mm_add_epi32(O14l, E4l); O14l = _mm_add_epi32(O14l, E5l); O14l = _mm_add_epi32(O14l, E6l); O14l = _mm_add_epi32(O14l, E7l); O14h = _mm_add_epi32(E0h, E1h); O14h = _mm_add_epi32(O14h, E2h); O14h = _mm_add_epi32(O14h, E3h); O14h = _mm_add_epi32(O14h, E4h); O14h = _mm_add_epi32(O14h, E5h); O14h = _mm_add_epi32(O14h, E6h); O14h = _mm_add_epi32(O14h, E7h); /* Compute O15*/ E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform32x32[0][15]))); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform32x32[0][15]))); E1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform32x32[1][15]))); E1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform32x32[1][15]))); E2l = _mm_madd_epi16(m128Tmp4, _mm_load_si128((__m128i *) (transform32x32[2][15]))); E2h = _mm_madd_epi16(m128Tmp5, _mm_load_si128((__m128i *) (transform32x32[2][15]))); E3l = _mm_madd_epi16(m128Tmp6, _mm_load_si128((__m128i *) (transform32x32[3][15]))); E3h = _mm_madd_epi16(m128Tmp7, _mm_load_si128((__m128i *) (transform32x32[3][15]))); E4l = _mm_madd_epi16(m128Tmp8, _mm_load_si128((__m128i *) (transform32x32[4][15]))); E4h = _mm_madd_epi16(m128Tmp9, _mm_load_si128((__m128i *) (transform32x32[4][15]))); E5l = _mm_madd_epi16(m128Tmp10, _mm_load_si128((__m128i *) (transform32x32[5][15]))); E5h = _mm_madd_epi16(m128Tmp11, _mm_load_si128((__m128i *) (transform32x32[5][15]))); E6l = _mm_madd_epi16(m128Tmp12, _mm_load_si128((__m128i *) (transform32x32[6][15]))); E6h = _mm_madd_epi16(m128Tmp13, _mm_load_si128((__m128i *) (transform32x32[6][15]))); E7l = _mm_madd_epi16(m128Tmp14, _mm_load_si128((__m128i *) (transform32x32[7][15]))); E7h = _mm_madd_epi16(m128Tmp15, _mm_load_si128((__m128i *) (transform32x32[7][15]))); O15l = _mm_add_epi32(E0l, E1l); O15l = _mm_add_epi32(O15l, E2l); O15l = _mm_add_epi32(O15l, E3l); O15l = _mm_add_epi32(O15l, E4l); O15l = _mm_add_epi32(O15l, E5l); 
O15l = _mm_add_epi32(O15l, E6l); O15l = _mm_add_epi32(O15l, E7l); O15h = _mm_add_epi32(E0h, E1h); O15h = _mm_add_epi32(O15h, E2h); O15h = _mm_add_epi32(O15h, E3h); O15h = _mm_add_epi32(O15h, E4h); O15h = _mm_add_epi32(O15h, E5h); O15h = _mm_add_epi32(O15h, E6h); O15h = _mm_add_epi32(O15h, E7h); /* Compute E0 */ m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); E0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); E0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14); E0l = _mm_add_epi32(E0l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_1[1][0])))); m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14); E0h = _mm_add_epi32(E0h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_1[1][0])))); m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22); E0l = _mm_add_epi32(E0l, _mm_madd_epi16(m128Tmp4, _mm_load_si128( (__m128i *) (transform16x16_1[2][0])))); m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22); E0h = _mm_add_epi32(E0h, _mm_madd_epi16(m128Tmp5, _mm_load_si128( (__m128i *) (transform16x16_1[2][0])))); m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30); E0l = _mm_add_epi32(E0l, _mm_madd_epi16(m128Tmp6, _mm_load_si128( (__m128i *) (transform16x16_1[3][0])))); m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30); E0h = _mm_add_epi32(E0h, _mm_madd_epi16(m128Tmp7, _mm_load_si128( (__m128i *) (transform16x16_1[3][0])))); /* Compute E1 */ E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); E1l = _mm_add_epi32(E1l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_1[1][1])))); E1h = _mm_add_epi32(E1h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_1[1][1])))); E1l = _mm_add_epi32(E1l, _mm_madd_epi16(m128Tmp4, _mm_load_si128( 
(__m128i *) (transform16x16_1[2][1])))); E1h = _mm_add_epi32(E1h, _mm_madd_epi16(m128Tmp5, _mm_load_si128( (__m128i *) (transform16x16_1[2][1])))); E1l = _mm_add_epi32(E1l, _mm_madd_epi16(m128Tmp6, _mm_load_si128( (__m128i *) (transform16x16_1[3][1])))); E1h = _mm_add_epi32(E1h, _mm_madd_epi16(m128Tmp7, _mm_load_si128( (__m128i *) (transform16x16_1[3][1])))); /* Compute E2 */ E2l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); E2h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); E2l = _mm_add_epi32(E2l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_1[1][2])))); E2h = _mm_add_epi32(E2h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_1[1][2])))); E2l = _mm_add_epi32(E2l, _mm_madd_epi16(m128Tmp4, _mm_load_si128( (__m128i *) (transform16x16_1[2][2])))); E2h = _mm_add_epi32(E2h, _mm_madd_epi16(m128Tmp5, _mm_load_si128( (__m128i *) (transform16x16_1[2][2])))); E2l = _mm_add_epi32(E2l, _mm_madd_epi16(m128Tmp6, _mm_load_si128( (__m128i *) (transform16x16_1[3][2])))); E2h = _mm_add_epi32(E2h, _mm_madd_epi16(m128Tmp7, _mm_load_si128( (__m128i *) (transform16x16_1[3][2])))); /* Compute E3 */ E3l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); E3h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); E3l = _mm_add_epi32(E3l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_1[1][3])))); E3h = _mm_add_epi32(E3h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_1[1][3])))); E3l = _mm_add_epi32(E3l, _mm_madd_epi16(m128Tmp4, _mm_load_si128( (__m128i *) (transform16x16_1[2][3])))); E3h = _mm_add_epi32(E3h, _mm_madd_epi16(m128Tmp5, _mm_load_si128( (__m128i *) (transform16x16_1[2][3])))); E3l = _mm_add_epi32(E3l, _mm_madd_epi16(m128Tmp6, _mm_load_si128( (__m128i *) (transform16x16_1[3][3])))); E3h = _mm_add_epi32(E3h, _mm_madd_epi16(m128Tmp7, _mm_load_si128( (__m128i *) 
(transform16x16_1[3][3])))); /* Compute E4 */ E4l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); E4h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); E4l = _mm_add_epi32(E4l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_1[1][4])))); E4h = _mm_add_epi32(E4h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_1[1][4])))); E4l = _mm_add_epi32(E4l, _mm_madd_epi16(m128Tmp4, _mm_load_si128( (__m128i *) (transform16x16_1[2][4])))); E4h = _mm_add_epi32(E4h, _mm_madd_epi16(m128Tmp5, _mm_load_si128( (__m128i *) (transform16x16_1[2][4])))); E4l = _mm_add_epi32(E4l, _mm_madd_epi16(m128Tmp6, _mm_load_si128( (__m128i *) (transform16x16_1[3][4])))); E4h = _mm_add_epi32(E4h, _mm_madd_epi16(m128Tmp7, _mm_load_si128( (__m128i *) (transform16x16_1[3][4])))); /* Compute E3 */ E5l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); E5h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); E5l = _mm_add_epi32(E5l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_1[1][5])))); E5h = _mm_add_epi32(E5h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_1[1][5])))); E5l = _mm_add_epi32(E5l, _mm_madd_epi16(m128Tmp4, _mm_load_si128( (__m128i *) (transform16x16_1[2][5])))); E5h = _mm_add_epi32(E5h, _mm_madd_epi16(m128Tmp5, _mm_load_si128( (__m128i *) (transform16x16_1[2][5])))); E5l = _mm_add_epi32(E5l, _mm_madd_epi16(m128Tmp6, _mm_load_si128( (__m128i *) (transform16x16_1[3][5])))); E5h = _mm_add_epi32(E5h, _mm_madd_epi16(m128Tmp7, _mm_load_si128( (__m128i *) (transform16x16_1[3][5])))); /* Compute E6 */ E6l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); E6h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); E6l = _mm_add_epi32(E6l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_1[1][6])))); E6h = 
_mm_add_epi32(E6h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_1[1][6])))); E6l = _mm_add_epi32(E6l, _mm_madd_epi16(m128Tmp4, _mm_load_si128( (__m128i *) (transform16x16_1[2][6])))); E6h = _mm_add_epi32(E6h, _mm_madd_epi16(m128Tmp5, _mm_load_si128( (__m128i *) (transform16x16_1[2][6])))); E6l = _mm_add_epi32(E6l, _mm_madd_epi16(m128Tmp6, _mm_load_si128( (__m128i *) (transform16x16_1[3][6])))); E6h = _mm_add_epi32(E6h, _mm_madd_epi16(m128Tmp7, _mm_load_si128( (__m128i *) (transform16x16_1[3][6])))); /* Compute E7 */ E7l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); E7h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); E7l = _mm_add_epi32(E7l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_1[1][7])))); E7h = _mm_add_epi32(E7h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_1[1][7])))); E7l = _mm_add_epi32(E7l, _mm_madd_epi16(m128Tmp4, _mm_load_si128( (__m128i *) (transform16x16_1[2][7])))); E7h = _mm_add_epi32(E7h, _mm_madd_epi16(m128Tmp5, _mm_load_si128( (__m128i *) (transform16x16_1[2][7])))); E7l = _mm_add_epi32(E7l, _mm_madd_epi16(m128Tmp6, _mm_load_si128( (__m128i *) (transform16x16_1[3][7])))); E7h = _mm_add_epi32(E7h, _mm_madd_epi16(m128Tmp7, _mm_load_si128( (__m128i *) (transform16x16_1[3][7])))); /* Compute EE0 and EEE */ m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12); E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12); E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28); E00l = _mm_add_epi32(E00l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_2[1][0])))); m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28); E00h = _mm_add_epi32(E00h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_2[1][0])))); E01l = 
_mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); E01l = _mm_add_epi32(E01l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_2[1][1])))); E01h = _mm_add_epi32(E01h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_2[1][1])))); E02l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); E02h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); E02l = _mm_add_epi32(E02l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_2[1][2])))); E02h = _mm_add_epi32(E02h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_2[1][2])))); E03l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); E03h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); E03l = _mm_add_epi32(E03l, _mm_madd_epi16(m128Tmp2, _mm_load_si128( (__m128i *) (transform16x16_2[1][3])))); E03h = _mm_add_epi32(E03h, _mm_madd_epi16(m128Tmp3, _mm_load_si128( (__m128i *) (transform16x16_2[1][3])))); /* Compute EE0 and EEE */ m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24); EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24); EE0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16); EEE0l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16); EEE0h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); EEE1l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i *) 
(transform16x16_3[1][1]))); EEE1h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); /* Compute EE */ EE2l = _mm_sub_epi32(EEE1l, EE1l); EE3l = _mm_sub_epi32(EEE0l, EE0l); EE2h = _mm_sub_epi32(EEE1h, EE1h); EE3h = _mm_sub_epi32(EEE0h, EE0h); EE0l = _mm_add_epi32(EEE0l, EE0l); EE1l = _mm_add_epi32(EEE1l, EE1l); EE0h = _mm_add_epi32(EEE0h, EE0h); EE1h = _mm_add_epi32(EEE1h, EE1h); /**/ EE7l = _mm_sub_epi32(EE0l, E00l); EE6l = _mm_sub_epi32(EE1l, E01l); EE5l = _mm_sub_epi32(EE2l, E02l); EE4l = _mm_sub_epi32(EE3l, E03l); EE7h = _mm_sub_epi32(EE0h, E00h); EE6h = _mm_sub_epi32(EE1h, E01h); EE5h = _mm_sub_epi32(EE2h, E02h); EE4h = _mm_sub_epi32(EE3h, E03h); EE0l = _mm_add_epi32(EE0l, E00l); EE1l = _mm_add_epi32(EE1l, E01l); EE2l = _mm_add_epi32(EE2l, E02l); EE3l = _mm_add_epi32(EE3l, E03l); EE0h = _mm_add_epi32(EE0h, E00h); EE1h = _mm_add_epi32(EE1h, E01h); EE2h = _mm_add_epi32(EE2h, E02h); EE3h = _mm_add_epi32(EE3h, E03h); /* Compute E */ E15l = _mm_sub_epi32(EE0l, E0l); E15l = _mm_add_epi32(E15l, m128iAdd); E14l = _mm_sub_epi32(EE1l, E1l); E14l = _mm_add_epi32(E14l, m128iAdd); E13l = _mm_sub_epi32(EE2l, E2l); E13l = _mm_add_epi32(E13l, m128iAdd); E12l = _mm_sub_epi32(EE3l, E3l); E12l = _mm_add_epi32(E12l, m128iAdd); E11l = _mm_sub_epi32(EE4l, E4l); E11l = _mm_add_epi32(E11l, m128iAdd); E10l = _mm_sub_epi32(EE5l, E5l); E10l = _mm_add_epi32(E10l, m128iAdd); E9l = _mm_sub_epi32(EE6l, E6l); E9l = _mm_add_epi32(E9l, m128iAdd); E8l = _mm_sub_epi32(EE7l, E7l); E8l = _mm_add_epi32(E8l, m128iAdd); E0l = _mm_add_epi32(EE0l, E0l); E0l = _mm_add_epi32(E0l, m128iAdd); E1l = _mm_add_epi32(EE1l, E1l); E1l = _mm_add_epi32(E1l, m128iAdd); E2l = _mm_add_epi32(EE2l, E2l); E2l = _mm_add_epi32(E2l, m128iAdd); E3l = _mm_add_epi32(EE3l, E3l); E3l = _mm_add_epi32(E3l, m128iAdd); E4l = _mm_add_epi32(EE4l, E4l); E4l = _mm_add_epi32(E4l, m128iAdd); E5l = _mm_add_epi32(EE5l, E5l); E5l = _mm_add_epi32(E5l, m128iAdd); E6l = _mm_add_epi32(EE6l, E6l); E6l = 
_mm_add_epi32(E6l, m128iAdd); E7l = _mm_add_epi32(EE7l, E7l); E7l = _mm_add_epi32(E7l, m128iAdd); E15h = _mm_sub_epi32(EE0h, E0h); E15h = _mm_add_epi32(E15h, m128iAdd); E14h = _mm_sub_epi32(EE1h, E1h); E14h = _mm_add_epi32(E14h, m128iAdd); E13h = _mm_sub_epi32(EE2h, E2h); E13h = _mm_add_epi32(E13h, m128iAdd); E12h = _mm_sub_epi32(EE3h, E3h); E12h = _mm_add_epi32(E12h, m128iAdd); E11h = _mm_sub_epi32(EE4h, E4h); E11h = _mm_add_epi32(E11h, m128iAdd); E10h = _mm_sub_epi32(EE5h, E5h); E10h = _mm_add_epi32(E10h, m128iAdd); E9h = _mm_sub_epi32(EE6h, E6h); E9h = _mm_add_epi32(E9h, m128iAdd); E8h = _mm_sub_epi32(EE7h, E7h); E8h = _mm_add_epi32(E8h, m128iAdd); E0h = _mm_add_epi32(EE0h, E0h); E0h = _mm_add_epi32(E0h, m128iAdd); E1h = _mm_add_epi32(EE1h, E1h); E1h = _mm_add_epi32(E1h, m128iAdd); E2h = _mm_add_epi32(EE2h, E2h); E2h = _mm_add_epi32(E2h, m128iAdd); E3h = _mm_add_epi32(EE3h, E3h); E3h = _mm_add_epi32(E3h, m128iAdd); E4h = _mm_add_epi32(EE4h, E4h); E4h = _mm_add_epi32(E4h, m128iAdd); E5h = _mm_add_epi32(EE5h, E5h); E5h = _mm_add_epi32(E5h, m128iAdd); E6h = _mm_add_epi32(EE6h, E6h); E6h = _mm_add_epi32(E6h, m128iAdd); E7h = _mm_add_epi32(EE7h, E7h); E7h = _mm_add_epi32(E7h, m128iAdd); m128iS0 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift)); m128iS1 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift)); m128iS2 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift)); m128iS3 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift)); m128iS4 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift), _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift)); m128iS5 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift), _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift)); m128iS6 = _mm_packs_epi32( 
_mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift), _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift)); m128iS7 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift), _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift)); m128iS8 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift), _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift)); m128iS9 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift), _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift)); m128iS10 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift), _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift)); m128iS11 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift), _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift)); m128iS12 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift), _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift)); m128iS13 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift), _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift)); m128iS14 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift), _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift)); m128iS15 = _mm_packs_epi32( _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift), _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift)); m128iS31 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift)); m128iS30 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift)); m128iS29 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift)); m128iS28 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift)); m128iS27 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift), _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift)); m128iS26 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift), _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift)); 
m128iS25 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift), _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift)); m128iS24 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift), _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift)); m128iS23 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift), _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift)); m128iS22 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift), _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift)); m128iS21 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift), _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift)); m128iS20 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift), _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift)); m128iS19 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift), _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift)); m128iS18 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift), _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift)); m128iS17 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift), _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift)); m128iS16 = _mm_packs_epi32( _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift), _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift)); if (!j) { /* Inverse the matrix */ E0l = _mm_unpacklo_epi16(m128iS0, m128iS16); E1l = _mm_unpacklo_epi16(m128iS1, m128iS17); E2l = _mm_unpacklo_epi16(m128iS2, m128iS18); E3l = _mm_unpacklo_epi16(m128iS3, m128iS19); E4l = _mm_unpacklo_epi16(m128iS4, m128iS20); E5l = _mm_unpacklo_epi16(m128iS5, m128iS21); E6l = _mm_unpacklo_epi16(m128iS6, m128iS22); E7l = _mm_unpacklo_epi16(m128iS7, m128iS23); E8l = _mm_unpacklo_epi16(m128iS8, m128iS24); E9l = _mm_unpacklo_epi16(m128iS9, m128iS25); E10l = _mm_unpacklo_epi16(m128iS10, m128iS26); E11l = _mm_unpacklo_epi16(m128iS11, m128iS27); E12l = _mm_unpacklo_epi16(m128iS12, m128iS28); E13l = _mm_unpacklo_epi16(m128iS13, m128iS29); E14l = _mm_unpacklo_epi16(m128iS14, m128iS30); E15l 
= _mm_unpacklo_epi16(m128iS15, m128iS31); O0l = _mm_unpackhi_epi16(m128iS0, m128iS16); O1l = _mm_unpackhi_epi16(m128iS1, m128iS17); O2l = _mm_unpackhi_epi16(m128iS2, m128iS18); O3l = _mm_unpackhi_epi16(m128iS3, m128iS19); O4l = _mm_unpackhi_epi16(m128iS4, m128iS20); O5l = _mm_unpackhi_epi16(m128iS5, m128iS21); O6l = _mm_unpackhi_epi16(m128iS6, m128iS22); O7l = _mm_unpackhi_epi16(m128iS7, m128iS23); O8l = _mm_unpackhi_epi16(m128iS8, m128iS24); O9l = _mm_unpackhi_epi16(m128iS9, m128iS25); O10l = _mm_unpackhi_epi16(m128iS10, m128iS26); O11l = _mm_unpackhi_epi16(m128iS11, m128iS27); O12l = _mm_unpackhi_epi16(m128iS12, m128iS28); O13l = _mm_unpackhi_epi16(m128iS13, m128iS29); O14l = _mm_unpackhi_epi16(m128iS14, m128iS30); O15l = _mm_unpackhi_epi16(m128iS15, m128iS31); E0h = _mm_unpacklo_epi16(E0l, E8l); E1h = _mm_unpacklo_epi16(E1l, E9l); E2h = _mm_unpacklo_epi16(E2l, E10l); E3h = _mm_unpacklo_epi16(E3l, E11l); E4h = _mm_unpacklo_epi16(E4l, E12l); E5h = _mm_unpacklo_epi16(E5l, E13l); E6h = _mm_unpacklo_epi16(E6l, E14l); E7h = _mm_unpacklo_epi16(E7l, E15l); E8h = _mm_unpackhi_epi16(E0l, E8l); E9h = _mm_unpackhi_epi16(E1l, E9l); E10h = _mm_unpackhi_epi16(E2l, E10l); E11h = _mm_unpackhi_epi16(E3l, E11l); E12h = _mm_unpackhi_epi16(E4l, E12l); E13h = _mm_unpackhi_epi16(E5l, E13l); E14h = _mm_unpackhi_epi16(E6l, E14l); E15h = _mm_unpackhi_epi16(E7l, E15l); m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); m128Tmp1 = 
_mm_unpackhi_epi16(E1h, E5h); m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h); m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h); m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h); m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h); m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h); m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h); m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); /* */ E0h = _mm_unpacklo_epi16(O0l, O8l); E1h = _mm_unpacklo_epi16(O1l, O9l); E2h = _mm_unpacklo_epi16(O2l, O10l); E3h = _mm_unpacklo_epi16(O3l, O11l); E4h = _mm_unpacklo_epi16(O4l, O12l); E5h = _mm_unpacklo_epi16(O5l, O13l); E6h = _mm_unpacklo_epi16(O6l, O14l); E7h = _mm_unpacklo_epi16(O7l, O15l); E8h = _mm_unpackhi_epi16(O0l, 
O8l); E9h = _mm_unpackhi_epi16(O1l, O9l); E10h = _mm_unpackhi_epi16(O2l, O10l); E11h = _mm_unpackhi_epi16(O3l, O11l); E12h = _mm_unpackhi_epi16(O4l, O12l); E13h = _mm_unpackhi_epi16(O5l, O13l); E14h = _mm_unpackhi_epi16(O6l, O14l); E15h = _mm_unpackhi_epi16(O7l, O15l); m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h); m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h); m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h); m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h); m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); 
m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h); m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h); m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h); m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h); m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); /* */ _mm_store_si128((__m128i *) (src + i), m128iS0); _mm_store_si128((__m128i *) (src + 32 + i), m128iS1); _mm_store_si128((__m128i *) (src + 64 + i), m128iS2); _mm_store_si128((__m128i *) (src + 96 + i), m128iS3); _mm_store_si128((__m128i *) (src + 128 + i), m128iS4); _mm_store_si128((__m128i *) (src + 160 + i), m128iS5); _mm_store_si128((__m128i *) (src + 192 + i), m128iS6); _mm_store_si128((__m128i *) (src + 224 + i), m128iS7); _mm_store_si128((__m128i *) (src + 256 + i), m128iS8); _mm_store_si128((__m128i *) (src + 288 + i), m128iS9); _mm_store_si128((__m128i *) (src + 320 + i), m128iS10); _mm_store_si128((__m128i *) (src + 352 + i), m128iS11); _mm_store_si128((__m128i *) (src + 384 + i), m128iS12); _mm_store_si128((__m128i *) (src + 416 + i), m128iS13); _mm_store_si128((__m128i *) (src + 448 + i), m128iS14); _mm_store_si128((__m128i *) (src + 480 + i), m128iS15); _mm_store_si128((__m128i *) (src + 512 + i), m128iS16); _mm_store_si128((__m128i *) (src + 544 + i), m128iS17); _mm_store_si128((__m128i *) (src + 576 + i), m128iS18); _mm_store_si128((__m128i *) (src + 608 + i), m128iS19); _mm_store_si128((__m128i *) (src + 640 + i), m128iS20); _mm_store_si128((__m128i *) (src + 672 + i), m128iS21); _mm_store_si128((__m128i *) (src + 704 + i), m128iS22); _mm_store_si128((__m128i *) (src + 736 + i), m128iS23); _mm_store_si128((__m128i *) (src + 768 + i), m128iS24); _mm_store_si128((__m128i *) (src + 
800 + i), m128iS25); _mm_store_si128((__m128i *) (src + 832 + i), m128iS26); _mm_store_si128((__m128i *) (src + 864 + i), m128iS27); _mm_store_si128((__m128i *) (src + 896 + i), m128iS28); _mm_store_si128((__m128i *) (src + 928 + i), m128iS29); _mm_store_si128((__m128i *) (src + 960 + i), m128iS30); _mm_store_si128((__m128i *) (src + 992 + i), m128iS31); if (i <= 16) { int k = i + 8; m128iS0 = _mm_load_si128((__m128i *) (src + k)); m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k)); m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k)); m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k)); m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k)); m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k)); m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k)); m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k)); m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k)); m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k)); m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k)); m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k)); m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k)); m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k)); m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k)); m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k)); m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k)); m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k)); m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k)); m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k)); m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k)); m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k)); m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k)); m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k)); m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k)); m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k)); m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k)); m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k)); m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k)); 
m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k)); m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k)); m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k)); } else { m128iS0 = _mm_load_si128((__m128i *) (src)); m128iS1 = _mm_load_si128((__m128i *) (src + 128)); m128iS2 = _mm_load_si128((__m128i *) (src + 256)); m128iS3 = _mm_load_si128((__m128i *) (src + 384)); m128iS4 = _mm_loadu_si128((__m128i *) (src + 512)); m128iS5 = _mm_load_si128((__m128i *) (src + 640)); m128iS6 = _mm_load_si128((__m128i *) (src + 768)); m128iS7 = _mm_load_si128((__m128i *) (src + 896)); m128iS8 = _mm_load_si128((__m128i *) (src + 8)); m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8)); m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8)); m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8)); m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8)); m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8)); m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8)); m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8)); m128iS16 = _mm_load_si128((__m128i *) (src + 16)); m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16)); m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16)); m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16)); m128iS20 = _mm_loadu_si128((__m128i *) (src + 512 + 16)); m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16)); m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16)); m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16)); m128iS24 = _mm_load_si128((__m128i *) (src + 24)); m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24)); m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24)); m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24)); m128iS28 = _mm_loadu_si128((__m128i *) (src + 512 + 24)); m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24)); m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24)); m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24)); shift = shift_2nd; m128iAdd = _mm_set1_epi32(add_2nd); } } else { int k, m = 
0; _mm_storeu_si128((__m128i *) (src), m128iS0); _mm_storeu_si128((__m128i *) (src + 8), m128iS1); _mm_storeu_si128((__m128i *) (src + 16), m128iS2); _mm_storeu_si128((__m128i *) (src + 24), m128iS3); _mm_storeu_si128((__m128i *) (src + 128), m128iS4); _mm_storeu_si128((__m128i *) (src + 128 + 8), m128iS5); _mm_storeu_si128((__m128i *) (src + 128 + 16), m128iS6); _mm_storeu_si128((__m128i *) (src + 128 + 24), m128iS7); _mm_storeu_si128((__m128i *) (src + 256), m128iS8); _mm_storeu_si128((__m128i *) (src + 256 + 8), m128iS9); _mm_storeu_si128((__m128i *) (src + 256 + 16), m128iS10); _mm_storeu_si128((__m128i *) (src + 256 + 24), m128iS11); _mm_storeu_si128((__m128i *) (src + 384), m128iS12); _mm_storeu_si128((__m128i *) (src + 384 + 8), m128iS13); _mm_storeu_si128((__m128i *) (src + 384 + 16), m128iS14); _mm_storeu_si128((__m128i *) (src + 384 + 24), m128iS15); _mm_storeu_si128((__m128i *) (src + 512), m128iS16); _mm_storeu_si128((__m128i *) (src + 512 + 8), m128iS17); _mm_storeu_si128((__m128i *) (src + 512 + 16), m128iS18); _mm_storeu_si128((__m128i *) (src + 512 + 24), m128iS19); _mm_storeu_si128((__m128i *) (src + 640), m128iS20); _mm_storeu_si128((__m128i *) (src + 640 + 8), m128iS21); _mm_storeu_si128((__m128i *) (src + 640 + 16), m128iS22); _mm_storeu_si128((__m128i *) (src + 640 + 24), m128iS23); _mm_storeu_si128((__m128i *) (src + 768), m128iS24); _mm_storeu_si128((__m128i *) (src + 768 + 8), m128iS25); _mm_storeu_si128((__m128i *) (src + 768 + 16), m128iS26); _mm_storeu_si128((__m128i *) (src + 768 + 24), m128iS27); _mm_storeu_si128((__m128i *) (src + 896), m128iS28); _mm_storeu_si128((__m128i *) (src + 896 + 8), m128iS29); _mm_storeu_si128((__m128i *) (src + 896 + 16), m128iS30); _mm_storeu_si128((__m128i *) (src + 896 + 24), m128iS31); dst = (uint16_t*) _dst + (i * stride); for (k = 0; k < 8; k++) { dst[0] = av_clip_uintp2(dst[0] + src[m],10); dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10); dst[2] = av_clip_uintp2(dst[2] + src[m + 16],10); dst[3] = 
av_clip_uintp2(dst[3] + src[m + 24],10); dst[4] = av_clip_uintp2( dst[4] + src[m + 128],10); dst[5] = av_clip_uintp2( dst[5] + src[m + 128 + 8],10); dst[6] = av_clip_uintp2( dst[6] + src[m + 128 + 16],10); dst[7] = av_clip_uintp2( dst[7] + src[m + 128 + 24],10); dst[8] = av_clip_uintp2( dst[8] + src[m + 256],10); dst[9] = av_clip_uintp2( dst[9] + src[m + 256 + 8],10); dst[10] = av_clip_uintp2( dst[10] + src[m + 256 + 16],10); dst[11] = av_clip_uintp2( dst[11] + src[m + 256 + 24],10); dst[12] = av_clip_uintp2( dst[12] + src[m + 384],10); dst[13] = av_clip_uintp2( dst[13] + src[m + 384 + 8],10); dst[14] = av_clip_uintp2( dst[14] + src[m + 384 + 16],10); dst[15] = av_clip_uintp2( dst[15] + src[m + 384 + 24],10); dst[16] = av_clip_uintp2( dst[16] + src[m + 512],10); dst[17] = av_clip_uintp2( dst[17] + src[m + 512 + 8],10); dst[18] = av_clip_uintp2( dst[18] + src[m + 512 + 16],10); dst[19] = av_clip_uintp2( dst[19] + src[m + 512 + 24],10); dst[20] = av_clip_uintp2( dst[20] + src[m + 640],10); dst[21] = av_clip_uintp2( dst[21] + src[m + 640 + 8],10); dst[22] = av_clip_uintp2( dst[22] + src[m + 640 + 16],10); dst[23] = av_clip_uintp2( dst[23] + src[m + 640 + 24],10); dst[24] = av_clip_uintp2( dst[24] + src[m + 768],10); dst[25] = av_clip_uintp2( dst[25] + src[m + 768 + 8],10); dst[26] = av_clip_uintp2( dst[26] + src[m + 768 + 16],10); dst[27] = av_clip_uintp2( dst[27] + src[m + 768 + 24],10); dst[28] = av_clip_uintp2( dst[28] + src[m + 896],10); dst[29] = av_clip_uintp2( dst[29] + src[m + 896 + 8],10); dst[30] = av_clip_uintp2( dst[30] + src[m + 896 + 16],10); dst[31] = av_clip_uintp2( dst[31] + src[m + 896 + 24],10); m += 1; dst += stride; } if (i <= 16) { int k = (i + 8) * 4; m128iS0 = _mm_load_si128((__m128i *) (src + k)); m128iS1 = _mm_load_si128((__m128i *) (src + 128 + k)); m128iS2 = _mm_load_si128((__m128i *) (src + 256 + k)); m128iS3 = _mm_load_si128((__m128i *) (src + 384 + k)); m128iS4 = _mm_loadu_si128((__m128i *) (src + 512 + k)); m128iS5 = 
_mm_load_si128((__m128i *) (src + 640 + k)); m128iS6 = _mm_load_si128((__m128i *) (src + 768 + k)); m128iS7 = _mm_load_si128((__m128i *) (src + 896 + k)); m128iS8 = _mm_load_si128((__m128i *) (src + 8 + k)); m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8 + k)); m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8 + k)); m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8 + k)); m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8 + k)); m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8 + k)); m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8 + k)); m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8 + k)); m128iS16 = _mm_load_si128((__m128i *) (src + 16 + k)); m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16 + k)); m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16 + k)); m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16 + k)); m128iS20 = _mm_loadu_si128( (__m128i *) (src + 512 + 16 + k)); m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16 + k)); m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16 + k)); m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16 + k)); m128iS24 = _mm_load_si128((__m128i *) (src + 24 + k)); m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24 + k)); m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24 + k)); m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24 + k)); m128iS28 = _mm_loadu_si128( (__m128i *) (src + 512 + 24 + k)); m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24 + k)); m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24 + k)); m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24 + k)); } } } } } #endif libde265-1.0.18/libde265/x86/sse-dct.h000066400000000000000000000027311515675107500166100ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013 openHEVC contributors * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. 
* * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef SSE_DCT_H #define SSE_DCT_H #include #include void ff_hevc_transform_skip_8_sse(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride); void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); void ff_hevc_transform_4x4_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); void ff_hevc_transform_8x8_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); void ff_hevc_transform_16x16_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); void ff_hevc_transform_32x32_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); #endif libde265-1.0.18/libde265/x86/sse-motion.cc000066400000000000000000005604111515675107500175050ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013 openHEVC contributors * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include // SSSE3 #if HAVE_SSE4_1 #include #endif #include "sse-motion.h" #include "libde265/util.h" ALIGNED_16(const int8_t) epel_filters[7][16] = { { -2, 58, 10, -2,-2, 58, 10, -2,-2, 58, 10, -2,-2, 58, 10, -2 }, { -4, 54, 16, -2,-4, 54, 16, -2,-4, 54, 16, -2,-4, 54, 16, -2 }, { -6, 46, 28, -4,-6, 46, 28, -4,-6, 46, 28, -4,-6, 46, 28, -4 }, { -4, 36, 36, -4,-4, 36, 36, -4,-4, 36, 36, -4,-4, 36, 36, -4 }, { -4, 28, 46, -6,-4, 28, 46, -6,-4, 28, 46, -6,-4, 28, 46, -6 }, { -2, 16, 54, -4,-2, 16, 54, -4,-2, 16, 54, -4,-2, 16, 54, -4 }, { -2, 10, 58, -2,-2, 10, 58, -2,-2, 10, 58, -2,-2, 10, 58, -2 }, }; static const uint8_t qpel_extra_before[4] = { 0, 3, 3, 2 }; //static const uint8_t qpel_extra_after[4] = { 0, 3, 4, 4 }; static const uint8_t qpel_extra[4] = { 0, 6, 7, 6 }; static const int epel_extra_before = 1; //static const int epel_extra_after = 2; static const int epel_extra = 3; #define MAX_PB_SIZE 64 #define MASKMOVE 0 void print128(const char* prefix, __m128i r) { unsigned char buf[16]; *(__m128i*)buf = r; printf("%s ",prefix); for (int i=0;i<16;i++) { if (i>0) { printf(":"); } printf("%02x", buf[i]); } printf("\n"); } void printm32(const char* prefix, unsigned char* p) { printf("%s ",prefix); for (int i=0;i<4;i++) { if (i>0) { printf(":"); } printf("%02x", p[i]); } printf("\n"); } #define BIT_DEPTH 8 void ff_hevc_put_unweighted_pred_8_sse(uint8_t *_dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height) { int x, y; uint8_t *dst = (uint8_t*) _dst; __m128i r0, r1, f0; f0 = _mm_set1_epi16(32); if(!(width & 15)) { for (y = 0; y < height; y++) { for (x = 0; x < width; x += 16) { r0 = _mm_load_si128((__m128i *) (src+x)); r1 = _mm_load_si128((__m128i *) (src+x + 8)); r0 = _mm_adds_epi16(r0, 
f0); r1 = _mm_adds_epi16(r1, f0); r0 = _mm_srai_epi16(r0, 6); r1 = _mm_srai_epi16(r1, 6); r0 = _mm_packus_epi16(r0, r1); _mm_storeu_si128((__m128i *) (dst+x), r0); } dst += dststride; src += srcstride; } }else if(!(width & 7)) { for (y = 0; y < height; y++) { for (x = 0; x < width; x += 8) { r0 = _mm_load_si128((__m128i *) (src+x)); r0 = _mm_adds_epi16(r0, f0); r0 = _mm_srai_epi16(r0, 6); r0 = _mm_packus_epi16(r0, r0); _mm_storel_epi64((__m128i *) (dst+x), r0); } dst += dststride; src += srcstride; } }else if(!(width & 3)){ for (y = 0; y < height; y++) { for(x = 0;x < width; x+=4){ r0 = _mm_loadl_epi64((__m128i *) (src+x)); r0 = _mm_adds_epi16(r0, f0); r0 = _mm_srai_epi16(r0, 6); r0 = _mm_packus_epi16(r0, r0); #if MASKMOVE _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x)); #else //r0 = _mm_shuffle_epi32 (r0, 0x00); *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(r0); #endif } dst += dststride; src += srcstride; } }else{ for (y = 0; y < height; y++) { for(x = 0;x < width; x+=2){ r0 = _mm_loadl_epi64((__m128i *) (src+x)); r0 = _mm_adds_epi16(r0, f0); r0 = _mm_srai_epi16(r0, 6); r0 = _mm_packus_epi16(r0, r0); #if MASKMOVE _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1),(char *) (dst+x)); #else *((uint16_t*)(dst+x)) = _mm_cvtsi128_si32(r0); #endif } dst += dststride; src += srcstride; } } } void ff_hevc_put_unweighted_pred_sse(uint8_t *_dst, ptrdiff_t _dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height) { int x, y; uint8_t *dst = (uint8_t*) _dst; ptrdiff_t dststride = _dststride / sizeof(uint8_t); __m128i r0, r1, f0; int shift = 14 - BIT_DEPTH; #if BIT_DEPTH < 14 int16_t offset = 1 << (shift - 1); #else int16_t offset = 0; #endif f0 = _mm_set1_epi16(offset); for (y = 0; y < height; y++) { for (x = 0; x < width; x += 16) { r0 = _mm_load_si128((__m128i *) &src[x]); r1 = _mm_load_si128((__m128i *) &src[x + 8]); r0 = _mm_adds_epi16(r0, f0); r1 = _mm_adds_epi16(r1, f0); r0 = _mm_srai_epi16(r0, 
shift); r1 = _mm_srai_epi16(r1, shift); r0 = _mm_packus_epi16(r0, r1); _mm_storeu_si128((__m128i *) &dst[x], r0); } dst += dststride; src += srcstride; } } void ff_hevc_put_weighted_pred_avg_8_sse(uint8_t *_dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height) { int x, y; uint8_t *dst = (uint8_t*) _dst; __m128i r0, r1, f0, r2, r3; f0 = _mm_set1_epi16(64); if(!(width & 15)){ for (y = 0; y < height; y++) { for (x = 0; x < width; x += 16) { r0 = _mm_load_si128((__m128i *) &src1[x]); r1 = _mm_load_si128((__m128i *) &src1[x + 8]); r2 = _mm_load_si128((__m128i *) &src2[x]); r3 = _mm_load_si128((__m128i *) &src2[x + 8]); r0 = _mm_adds_epi16(r0, f0); r1 = _mm_adds_epi16(r1, f0); r0 = _mm_adds_epi16(r0, r2); r1 = _mm_adds_epi16(r1, r3); r0 = _mm_srai_epi16(r0, 7); r1 = _mm_srai_epi16(r1, 7); r0 = _mm_packus_epi16(r0, r1); _mm_storeu_si128((__m128i *) (dst + x), r0); } dst += dststride; src1 += srcstride; src2 += srcstride; } }else if(!(width & 7)){ for (y = 0; y < height; y++) { for(x=0;x= 1){ if(!(width & 15)){ for (y = 0; y < height; y++) { for (x = 0; x < width; x += 16) { x0 = _mm_load_si128((__m128i *) &src[x]); x2 = _mm_load_si128((__m128i *) &src[x + 8]); x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0), _mm_mulhi_epi16(x0, c0)); x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0), _mm_mulhi_epi16(x2, c0)); x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0), _mm_mulhi_epi16(x0, c0)); x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0), _mm_mulhi_epi16(x2, c0)); x0 = _mm_add_epi32(x0, add2); x1 = _mm_add_epi32(x1, add2); x2 = _mm_add_epi32(x2, add2); x3 = _mm_add_epi32(x3, add2); x0 = _mm_srai_epi32(x0, log2Wd); x1 = _mm_srai_epi32(x1, log2Wd); x2 = _mm_srai_epi32(x2, log2Wd); x3 = _mm_srai_epi32(x3, log2Wd); x0 = _mm_add_epi32(x0, add); x1 = _mm_add_epi32(x1, add); x2 = _mm_add_epi32(x2, add); x3 = _mm_add_epi32(x3, add); x0 = _mm_packus_epi32(x0, x1); x2 = _mm_packus_epi32(x2, x3); x0 = _mm_packus_epi16(x0, x2); 
_mm_storeu_si128((__m128i *) (dst + x), x0); } dst += dststride; src += srcstride; } }else if(!(width & 7)){ for (y = 0; y < height; y++) { for(x=0;x= 1) for (y = 0; y < height; y++) { for (x = 0; x < width; x += 16) { x0 = _mm_load_si128((__m128i *) &src[x]); x2 = _mm_load_si128((__m128i *) &src[x + 8]); x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0), _mm_mulhi_epi16(x0, c0)); x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0), _mm_mulhi_epi16(x2, c0)); x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0), _mm_mulhi_epi16(x0, c0)); x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0), _mm_mulhi_epi16(x2, c0)); x0 = _mm_add_epi32(x0, add2); x1 = _mm_add_epi32(x1, add2); x2 = _mm_add_epi32(x2, add2); x3 = _mm_add_epi32(x3, add2); x0 = _mm_srai_epi32(x0, log2Wd); x1 = _mm_srai_epi32(x1, log2Wd); x2 = _mm_srai_epi32(x2, log2Wd); x3 = _mm_srai_epi32(x3, log2Wd); x0 = _mm_add_epi32(x0, add); x1 = _mm_add_epi32(x1, add); x2 = _mm_add_epi32(x2, add); x3 = _mm_add_epi32(x3, add); x0 = _mm_packus_epi32(x0, x1); x2 = _mm_packus_epi32(x2, x3); x0 = _mm_packus_epi16(x0, x2); _mm_storeu_si128((__m128i *) (dst + x), x0); } dst += dststride; src += srcstride; } else for (y = 0; y < height; y++) { for (x = 0; x < width; x += 16) { x0 = _mm_load_si128((__m128i *) &src[x]); x2 = _mm_load_si128((__m128i *) &src[x + 8]); x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0), _mm_mulhi_epi16(x0, c0)); x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0), _mm_mulhi_epi16(x2, c0)); x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0), _mm_mulhi_epi16(x0, c0)); x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0), _mm_mulhi_epi16(x2, c0)); x0 = _mm_add_epi32(x0, add2); x1 = _mm_add_epi32(x1, add2); x2 = _mm_add_epi32(x2, add2); x3 = _mm_add_epi32(x3, add2); x0 = _mm_packus_epi32(x0, x1); x2 = _mm_packus_epi32(x2, x3); x0 = _mm_packus_epi16(x0, x2); _mm_storeu_si128((__m128i *) (dst + x), x0); } dst += dststride; src += srcstride; } } #endif #if HAVE_SSE4_1 void ff_hevc_weighted_pred_avg_8_sse4(uint8_t denom, int16_t 
wl0Flag, int16_t wl1Flag, int16_t ol0Flag, int16_t ol1Flag, uint8_t *_dst, ptrdiff_t _dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height) { int shift, shift2; int log2Wd; int o0; int o1; int x, y; uint8_t *dst = (uint8_t*) _dst; ptrdiff_t dststride = _dststride / sizeof(uint8_t); __m128i x0, x1, x2, x3, r0, r1, r2, r3, c0, c1, c2; shift = 14 - BIT_DEPTH; log2Wd = denom + shift; o0 = (ol0Flag) * (1 << (BIT_DEPTH - 8)); o1 = (ol1Flag) * (1 << (BIT_DEPTH - 8)); shift2 = (log2Wd + 1); c0 = _mm_set1_epi16(wl0Flag); c1 = _mm_set1_epi16(wl1Flag); c2 = _mm_set1_epi32((o0 + o1 + 1) << log2Wd); if(!(width & 15)){ for (y = 0; y < height; y++) { for (x = 0; x < width; x += 16) { x0 = _mm_load_si128((__m128i *) &src1[x]); x1 = _mm_load_si128((__m128i *) &src1[x + 8]); x2 = _mm_load_si128((__m128i *) &src2[x]); x3 = _mm_load_si128((__m128i *) &src2[x + 8]); r0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0), _mm_mulhi_epi16(x0, c0)); r1 = _mm_unpacklo_epi16(_mm_mullo_epi16(x1, c0), _mm_mulhi_epi16(x1, c0)); r2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c1), _mm_mulhi_epi16(x2, c1)); r3 = _mm_unpacklo_epi16(_mm_mullo_epi16(x3, c1), _mm_mulhi_epi16(x3, c1)); x0 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0), _mm_mulhi_epi16(x0, c0)); x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x1, c0), _mm_mulhi_epi16(x1, c0)); x2 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c1), _mm_mulhi_epi16(x2, c1)); x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x3, c1), _mm_mulhi_epi16(x3, c1)); r0 = _mm_add_epi32(r0, r2); r1 = _mm_add_epi32(r1, r3); r2 = _mm_add_epi32(x0, x2); r3 = _mm_add_epi32(x1, x3); r0 = _mm_add_epi32(r0, c2); r1 = _mm_add_epi32(r1, c2); r2 = _mm_add_epi32(r2, c2); r3 = _mm_add_epi32(r3, c2); r0 = _mm_srai_epi32(r0, shift2); r1 = _mm_srai_epi32(r1, shift2); r2 = _mm_srai_epi32(r2, shift2); r3 = _mm_srai_epi32(r3, shift2); r0 = _mm_packus_epi32(r0, r2); r1 = _mm_packus_epi32(r1, r3); r0 = _mm_packus_epi16(r0, r1); _mm_storeu_si128((__m128i *) (dst + x), r0); } 
dst += dststride; src1 += srcstride; src2 += srcstride; } }else if(!(width & 7)){ for (y = 0; y < height; y++) { for(x=0;x>1; if(!(width & 7)){ //x1= _mm_setzero_si128(); for (y = 0; y < height; y++) { for (x = 0; x < width; x += 8) { x2 = _mm_loadu_si128((__m128i *) &src[x]); x2 = _mm_slli_epi16(x2, 4); //shift 14 - BIT LENGTH _mm_store_si128((__m128i *) &dst[x], x2); } src += srcstride; dst += dststride; } }else if(!(width & 3)){ //x1= _mm_setzero_si128(); for (y = 0; y < height; y++) { for (x = 0; x < width; x += 4) { x2 = _mm_loadl_epi64((__m128i *) &src[x]); x2 = _mm_slli_epi16(x2, 4); //shift 14 - BIT LENGTH _mm_storel_epi64((__m128i *) &dst[x], x2); } src += srcstride; dst += dststride; } }else{ //x1= _mm_setzero_si128(); for (y = 0; y < height; y++) { for (x = 0; x < width; x += 2) { x2 = _mm_loadl_epi64((__m128i *) &src[x]); x2 = _mm_slli_epi16(x2, 4); //shift 14 - BIT LENGTH _mm_maskmoveu_si128(x2,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x)); } src += srcstride; dst += dststride; } } } #endif void ff_hevc_put_hevc_epel_h_8_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth) { int x, y; const uint8_t *src = (const uint8_t*) _src; ptrdiff_t srcstride = _srcstride; const int8_t *filter = epel_filters[mx - 1]; __m128i r0, bshuffle1, bshuffle2, x1, x2, x3; int8_t filter_0 = filter[0]; int8_t filter_1 = filter[1]; int8_t filter_2 = filter[2]; int8_t filter_3 = filter[3]; r0 = _mm_set_epi8(filter_3, filter_2, filter_1, filter_0, filter_3, filter_2, filter_1, filter_0, filter_3, filter_2, filter_1, filter_0, filter_3, filter_2, filter_1, filter_0); bshuffle1 = _mm_set_epi8(6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0); /* printf("---IN---SSE\n"); int extra_top = 1; int extra_left = 1; int extra_right = 2; int extra_bottom = 2; for (int y=-extra_top;y>1; const int8_t *filter = epel_filters[mx - 1]; __m128i r0, bshuffle1, bshuffle2, x1, 
x2, x3, r1; int8_t filter_0 = filter[0]; int8_t filter_1 = filter[1]; int8_t filter_2 = filter[2]; int8_t filter_3 = filter[3]; r0 = _mm_set_epi16(filter_3, filter_2, filter_1, filter_0, filter_3, filter_2, filter_1, filter_0); bshuffle1 = _mm_set_epi8(9,8,7,6,5,4, 3, 2,7,6,5,4, 3, 2, 1, 0); if(!(width & 3)){ bshuffle2 = _mm_set_epi8(13,12,11,10,9,8,7,6,11,10, 9,8,7,6,5, 4); for (y = 0; y < height; y++) { for (x = 0; x < width; x += 4) { x1 = _mm_loadu_si128((__m128i *) &src[x-1]); x2 = _mm_shuffle_epi8(x1, bshuffle1); x3 = _mm_shuffle_epi8(x1, bshuffle2); x2 = _mm_madd_epi16(x2, r0); x3 = _mm_madd_epi16(x3, r0); x2 = _mm_hadd_epi32(x2, x3); x2= _mm_srai_epi32(x2,2); //>> (BIT_DEPTH - 8) x2 = _mm_packs_epi32(x2,r0); //give results back _mm_storel_epi64((__m128i *) &dst[x], x2); } src += srcstride; dst += dststride; } }else{ r1= _mm_setzero_si128(); for (y = 0; y < height; y++) { for (x = 0; x < width; x += 2) { /* load data in register */ x1 = _mm_loadu_si128((__m128i *) &src[x-1]); x2 = _mm_shuffle_epi8(x1, bshuffle1); /* PMADDUBSW then PMADDW */ x2 = _mm_madd_epi16(x2, r0); x2 = _mm_hadd_epi32(x2, r1); x2= _mm_srai_epi32(x2,2); //>> (BIT_DEPTH - 8) x2 = _mm_packs_epi32(x2, r1); /* give results back */ _mm_maskmoveu_si128(x2,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x)); } src += srcstride; dst += dststride; } } } #endif void ff_hevc_put_hevc_epel_v_8_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth) { int x, y; __m128i x0, x1, x2, x3, t0, t1, t2, t3, r0, f0, f1, f2, f3, r1; uint8_t *src = (uint8_t*) _src; ptrdiff_t srcstride = _srcstride / sizeof(uint8_t); const int8_t *filter = epel_filters[my - 1]; int8_t filter_0 = filter[0]; int8_t filter_1 = filter[1]; int8_t filter_2 = filter[2]; int8_t filter_3 = filter[3]; f0 = _mm_set1_epi16(filter_0); f1 = _mm_set1_epi16(filter_1); f2 = _mm_set1_epi16(filter_2); f3 = _mm_set1_epi16(filter_3); 
if(!(width & 15)){ for (y = 0; y < height; y++) { for (x = 0; x < width; x += 16) { /* check if memory needs to be reloaded */ x0 = _mm_loadu_si128((__m128i *) &src[x - srcstride]); x1 = _mm_loadu_si128((__m128i *) &src[x]); x2 = _mm_loadu_si128((__m128i *) &src[x + srcstride]); x3 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]); t0 = _mm_unpacklo_epi8(x0, _mm_setzero_si128()); t1 = _mm_unpacklo_epi8(x1, _mm_setzero_si128()); t2 = _mm_unpacklo_epi8(x2, _mm_setzero_si128()); t3 = _mm_unpacklo_epi8(x3, _mm_setzero_si128()); x0 = _mm_unpackhi_epi8(x0, _mm_setzero_si128()); x1 = _mm_unpackhi_epi8(x1, _mm_setzero_si128()); x2 = _mm_unpackhi_epi8(x2, _mm_setzero_si128()); x3 = _mm_unpackhi_epi8(x3, _mm_setzero_si128()); /* multiply by correct value : */ r0 = _mm_mullo_epi16(t0, f0); r1 = _mm_mullo_epi16(x0, f0); r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t1, f1)); r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x1, f1)); r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t2, f2)); r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x2, f2)); r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t3, f3)); r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x3, f3)); /* give results back */ _mm_store_si128((__m128i *) &dst[x], r0); _mm_storeu_si128((__m128i *) &dst[x + 8], r1); } src += srcstride; dst += dststride; } }else if(!(width & 7)){ r1= _mm_setzero_si128(); for (y = 0; y < height; y++) { for(x=0;x>1; const int8_t *filter = epel_filters[my - 1]; int8_t filter_0 = filter[0]; int8_t filter_1 = filter[1]; int8_t filter_2 = filter[2]; int8_t filter_3 = filter[3]; f0 = _mm_set1_epi16(filter_0); f1 = _mm_set1_epi16(filter_1); f2 = _mm_set1_epi16(filter_2); f3 = _mm_set1_epi16(filter_3); if(!(width & 7)){ r1= _mm_setzero_si128(); for (y = 0; y < height; y++) { for(x=0;x> (BIT_DEPTH - 8) t0= _mm_srai_epi32(t0,2);//>> (BIT_DEPTH - 8) r0= _mm_packs_epi32(r0, t0); // give results back _mm_storeu_si128((__m128i *) &dst[x], r0); } src += srcstride; dst += dststride; } }else if(!(width & 3)){ r1= _mm_setzero_si128(); for (y = 0; y 
< height; y++) { for(x=0;x> (BIT_DEPTH - 8) r0= _mm_packs_epi32(r0, r0); // give results back _mm_storel_epi64((__m128i *) &dst[x], r0); } src += srcstride; dst += dststride; } }else{ r1= _mm_setzero_si128(); for (y = 0; y < height; y++) { for(x=0;x> (BIT_DEPTH - 8) r0= _mm_packs_epi32(r0, r0); /* give results back */ _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x)); } src += srcstride; dst += dststride; } } } #endif void ff_hevc_put_hevc_epel_hv_8_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth) { int x, y; uint8_t *src = (uint8_t*) _src; ptrdiff_t srcstride = _srcstride; const int8_t *filter_h = epel_filters[mx - 1]; const int8_t *filter_v = epel_filters[my - 1]; __m128i r0, bshuffle1, bshuffle2, x0, x1, x2, x3, t0, t1, t2, t3, f0, f1, f2, f3, r1, r2; int8_t filter_0 = filter_h[0]; int8_t filter_1 = filter_h[1]; int8_t filter_2 = filter_h[2]; int8_t filter_3 = filter_h[3]; int16_t *tmp = mcbuffer; r0 = _mm_set_epi8(filter_3, filter_2, filter_1, filter_0, filter_3, filter_2, filter_1, filter_0, filter_3, filter_2, filter_1, filter_0, filter_3, filter_2, filter_1, filter_0); bshuffle1 = _mm_set_epi8(6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0); src -= epel_extra_before * srcstride; f3 = _mm_set1_epi16(filter_v[3]); f1 = _mm_set1_epi16(filter_v[1]); f2 = _mm_set1_epi16(filter_v[2]); f0 = _mm_set1_epi16(filter_v[0]); /* horizontal treatment */ if(!(width & 7)){ bshuffle2 = _mm_set_epi8(10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4); for (y = 0; y < height + epel_extra; y++) { for (x = 0; x < width; x += 8) { x1 = _mm_loadu_si128((__m128i *) &src[x - 1]); x2 = _mm_shuffle_epi8(x1, bshuffle1); x3 = _mm_shuffle_epi8(x1, bshuffle2); /* PMADDUBSW then PMADDW */ x2 = _mm_maddubs_epi16(x2, r0); x3 = _mm_maddubs_epi16(x3, r0); x2 = _mm_hadd_epi16(x2, x3); _mm_store_si128((__m128i *) &tmp[x], x2); } src += srcstride; tmp += 
MAX_PB_SIZE; } tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE; /* vertical treatment */ for (y = 0; y < height; y++) { for (x = 0; x < width; x += 8) { /* check if memory needs to be reloaded */ x0 = _mm_load_si128((__m128i *) &tmp[x - MAX_PB_SIZE]); x1 = _mm_load_si128((__m128i *) &tmp[x]); x2 = _mm_load_si128((__m128i *) &tmp[x + MAX_PB_SIZE]); x3 = _mm_load_si128((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]); r0 = _mm_mullo_epi16(x0, f0); r1 = _mm_mulhi_epi16(x0, f0); r2 = _mm_mullo_epi16(x1, f1); t0 = _mm_unpacklo_epi16(r0, r1); x0 = _mm_unpackhi_epi16(r0, r1); r0 = _mm_mulhi_epi16(x1, f1); r1 = _mm_mullo_epi16(x2, f2); t1 = _mm_unpacklo_epi16(r2, r0); x1 = _mm_unpackhi_epi16(r2, r0); r2 = _mm_mulhi_epi16(x2, f2); r0 = _mm_mullo_epi16(x3, f3); t2 = _mm_unpacklo_epi16(r1, r2); x2 = _mm_unpackhi_epi16(r1, r2); r1 = _mm_mulhi_epi16(x3, f3); t3 = _mm_unpacklo_epi16(r0, r1); x3 = _mm_unpackhi_epi16(r0, r1); /* multiply by correct value : */ r0 = _mm_add_epi32(t0, t1); r1 = _mm_add_epi32(x0, x1); r0 = _mm_add_epi32(r0, t2); r1 = _mm_add_epi32(r1, x2); r0 = _mm_add_epi32(r0, t3); r1 = _mm_add_epi32(r1, x3); r0 = _mm_srai_epi32(r0, 6); r1 = _mm_srai_epi32(r1, 6); /* give results back */ r0 = _mm_packs_epi32(r0, r1); _mm_store_si128((__m128i *) &dst[x], r0); } tmp += MAX_PB_SIZE; dst += dststride; } }else if(!(width & 3)){ for (y = 0; y < height + epel_extra; y ++) { for(x=0;x>1; const int8_t *filter_h = epel_filters[mx - 1]; const int8_t *filter_v = epel_filters[my - 1]; __m128i r0, bshuffle1, bshuffle2, x0, x1, x2, x3, t0, t1, t2, t3, f0, f1, f2, f3, r1, r2, r3; int8_t filter_0 = filter_h[0]; int8_t filter_1 = filter_h[1]; int8_t filter_2 = filter_h[2]; int8_t filter_3 = filter_h[3]; int16_t *tmp = mcbuffer; r0 = _mm_set_epi16(filter_3, filter_2, filter_1, filter_0, filter_3, filter_2, filter_1, filter_0); bshuffle1 = _mm_set_epi8(9,8,7,6,5,4, 3, 2,7,6,5,4, 3, 2, 1, 0); src -= epel_extra_before * srcstride; f0 = _mm_set1_epi16(filter_v[0]); f1 = 
_mm_set1_epi16(filter_v[1]); f2 = _mm_set1_epi16(filter_v[2]); f3 = _mm_set1_epi16(filter_v[3]); /* horizontal treatment */ if(!(width & 3)){ bshuffle2 = _mm_set_epi8(13,12,11,10,9,8,7,6,11,10, 9,8,7,6,5, 4); for (y = 0; y < height + epel_extra; y ++) { for(x=0;x> (BIT_DEPTH - 8) x2 = _mm_packs_epi32(x2,r0); //give results back _mm_storel_epi64((__m128i *) &tmp[x], x2); } src += srcstride; tmp += MAX_PB_SIZE; } tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE; // vertical treatment for (y = 0; y < height; y++) { for (x = 0; x < width; x += 4) { x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]); x1 = _mm_loadl_epi64((__m128i *) &tmp[x]); x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]); x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]); r0 = _mm_mullo_epi16(x0, f0); r1 = _mm_mulhi_epi16(x0, f0); r2 = _mm_mullo_epi16(x1, f1); t0 = _mm_unpacklo_epi16(r0, r1); r0 = _mm_mulhi_epi16(x1, f1); r1 = _mm_mullo_epi16(x2, f2); t1 = _mm_unpacklo_epi16(r2, r0); r2 = _mm_mulhi_epi16(x2, f2); r0 = _mm_mullo_epi16(x3, f3); t2 = _mm_unpacklo_epi16(r1, r2); r1 = _mm_mulhi_epi16(x3, f3); t3 = _mm_unpacklo_epi16(r0, r1); r0 = _mm_add_epi32(t0, t1); r0 = _mm_add_epi32(r0, t2); r0 = _mm_add_epi32(r0, t3); r0 = _mm_srai_epi32(r0, 6); // give results back r0 = _mm_packs_epi32(r0, r0); _mm_storel_epi64((__m128i *) &dst[x], r0); } tmp += MAX_PB_SIZE; dst += dststride; } }else{ bshuffle2=_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1); r1= _mm_setzero_si128(); for (y = 0; y < height + epel_extra; y ++) { for(x=0;x> (BIT_DEPTH - 8) x2 = _mm_packs_epi32(x2, r1); /* give results back */ _mm_maskmoveu_si128(x2,bshuffle2,(char *) (tmp+x)); } src += srcstride; tmp += MAX_PB_SIZE; } tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE; /* vertical treatment */ for (y = 0; y < height; y++) { for (x = 0; x < width; x += 2) { /* check if memory needs to be reloaded */ x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]); x1 = _mm_loadl_epi64((__m128i *) &tmp[x]); x2 = 
_mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]); x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]); r0 = _mm_mullo_epi16(x0, f0); t0 = _mm_mulhi_epi16(x0, f0); x0= _mm_unpacklo_epi16(r0,t0); r1 = _mm_mullo_epi16(x1, f1); t1 = _mm_mulhi_epi16(x1, f1); x1= _mm_unpacklo_epi16(r1,t1); r2 = _mm_mullo_epi16(x2, f2); t2 = _mm_mulhi_epi16(x2, f2); x2= _mm_unpacklo_epi16(r2,t2); r3 = _mm_mullo_epi16(x3, f3); t3 = _mm_mulhi_epi16(x3, f3); x3= _mm_unpacklo_epi16(r3,t3); r0= _mm_add_epi32(x0,x1); r1= _mm_add_epi32(x2,x3); r0= _mm_add_epi32(r0,r1); r0 = _mm_srai_epi32(r0, 6); /* give results back */ r0 = _mm_packs_epi32(r0, r0); _mm_maskmoveu_si128(r0,bshuffle2,(char *) (dst+x)); } tmp += MAX_PB_SIZE; dst += dststride; } } } #endif void ff_hevc_put_hevc_qpel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int16_t* mcbuffer) { int x, y; __m128i x1, x2, x3, x0; uint8_t *src = (uint8_t*) _src; ptrdiff_t srcstride = _srcstride; x0= _mm_setzero_si128(); if(!(width & 15)){ for (y = 0; y < height; y++) { for (x = 0; x < width; x += 16) { x1 = _mm_loadu_si128((__m128i *) &src[x]); x2 = _mm_unpacklo_epi8(x1, x0); x3 = _mm_unpackhi_epi8(x1, x0); x2 = _mm_slli_epi16(x2, 6); x3 = _mm_slli_epi16(x3, 6); _mm_storeu_si128((__m128i *) &dst[x], x2); _mm_storeu_si128((__m128i *) &dst[x + 8], x3); } src += srcstride; dst += dststride; } }else if(!(width & 7)){ for (y = 0; y < height; y++) { for (x = 0; x < width; x += 8) { x1 = _mm_loadu_si128((__m128i *) &src[x]); x2 = _mm_unpacklo_epi8(x1, x0); x2 = _mm_slli_epi16(x2, 6); _mm_storeu_si128((__m128i *) &dst[x], x2); } src += srcstride; dst += dststride; } }else if(!(width & 3)){ for (y = 0; y < height; y++) { for(x=0;x>1; if(!(width & 7)){ for (y = 0; y < height; y++) { for (x = 0; x < width; x += 8) { x1 = _mm_loadu_si128((__m128i *) &src[x]); x2 = _mm_slli_epi16(x1, 4); //14-BIT DEPTH _mm_storeu_si128((__m128i *) &dst[x], x2); } src += srcstride; dst += dststride; } 
}else if(!(width & 3)){ for (y = 0; y < height; y++) { for(x=0;x>1; __m128i x0, x1, x2, x3, r0; r0 = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1); x0= _mm_setzero_si128(); x3= _mm_set_epi32(0,0,0,-1); for (y = 0; y < height; y ++) { for(x=0;x>BIT_DEPTH-8 x1= _mm_packs_epi32(x1,x0); // dst[x]= _mm_extract_epi16(x1,0); _mm_maskmoveu_si128(x1,x3,(char *) (dst+x)); } src += srcstride; dst += dststride; } } #endif void ff_hevc_put_hevc_qpel_h_2_8_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int16_t* mcbuffer) { int x, y; const uint8_t *src = _src; ptrdiff_t srcstride = _srcstride / sizeof(uint8_t); __m128i x1, r0, x2, x3, x4, x5; r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11, 4, -1); /* LOAD src from memory to registers to limit memory bandwidth */ if(!(width - 15)){ for (y = 0; y < height; y++) { for (x = 0; x < width; x += 8) { /* load data in register */ x1 = _mm_loadu_si128((__m128i *) &src[x - 3]); x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1)); x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2), _mm_srli_si128(x1, 3)); x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4), _mm_srli_si128(x1, 5)); x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6), _mm_srli_si128(x1, 7)); /* PMADDUBSW then PMADDW */ x2 = _mm_maddubs_epi16(x2, r0); x3 = _mm_maddubs_epi16(x3, r0); x4 = _mm_maddubs_epi16(x4, r0); x5 = _mm_maddubs_epi16(x5, r0); x2 = _mm_hadd_epi16(x2, x3); x4 = _mm_hadd_epi16(x4, x5); x2 = _mm_hadd_epi16(x2, x4); /* give results back */ _mm_store_si128((__m128i *) &dst[x],x2); } src += srcstride; dst += dststride; } }else{ for (y = 0; y < height; y ++) { for(x=0;x> 1; __m128i x1, x2, x3, x4, x5, x6, x7, r1; __m128i t1, t2, t3, t4, t5, t6, t7, t8; t7= _mm_set1_epi32(1); t6= _mm_set1_epi32(-5); t5= _mm_set1_epi32(17); t4= _mm_set1_epi32(58); t3= _mm_set1_epi32(-10); t2= _mm_set1_epi32(4); t1= _mm_set1_epi32(-1); t8= _mm_setzero_si128(); for (y = 0; y < height; y ++) { for(x=0;x> 1; __m128i x1, x2, 
x3, x4, x5, x6, x7, x8, r0, r1, r2; __m128i t1, t2, t3, t4, t5, t6, t7, t8; r1 = _mm_set_epi16(-1, 4, -11, 40, 40, -11, 4, -1); t1= _mm_set1_epi32(-1); t2= _mm_set1_epi32(4); t3= _mm_set1_epi32(-11); t4= _mm_set1_epi32(40); t5= _mm_set1_epi32(40); t6= _mm_set1_epi32(-11); t7= _mm_set1_epi32(4); t8= _mm_set1_epi32(-1); { x = 0; r0 = _mm_setzero_si128(); for (y = 0; y < height; y ++) { for(x=0;x> 1; __m128i x1, x2, x3, x4, x5, x6, x7, r0; __m128i t1, t2, t3, t4, t5, t6, t7, t8; t7 = _mm_set1_epi32(-1); t6 = _mm_set1_epi32(4); t5 = _mm_set1_epi32(-10); t4 = _mm_set1_epi32(58); t3 = _mm_set1_epi32(17); t2 = _mm_set1_epi32(-5); t1 = _mm_set1_epi32(1); t8= _mm_setzero_si128(); { for (y = 0; y < height; y ++) { for(x=0;x * * This file is part of libde265. * * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . 
*/ #ifndef SSE_MOTION_H #define SSE_MOTION_H #include #include void ff_hevc_put_unweighted_pred_8_sse(uint8_t *_dst, ptrdiff_t dststride, const int16_t *src, ptrdiff_t srcstride, int width, int height); void ff_hevc_put_weighted_pred_avg_8_sse(uint8_t *_dst, ptrdiff_t dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, int width, int height); void ff_hevc_put_hevc_epel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer); void ff_hevc_put_hevc_epel_h_8_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth); void ff_hevc_put_hevc_epel_v_8_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth); void ff_hevc_put_hevc_epel_hv_8_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, int width, int height, int mx, int my, int16_t* mcbuffer, int bit_depth); void ff_hevc_put_hevc_qpel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int16_t* mcbuffer); void ff_hevc_put_hevc_qpel_v_1_8_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int16_t* mcbuffer); void ff_hevc_put_hevc_qpel_v_2_8_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int16_t* mcbuffer); void ff_hevc_put_hevc_qpel_v_3_8_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int16_t* mcbuffer); void ff_hevc_put_hevc_qpel_h_1_8_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int16_t* mcbuffer); void ff_hevc_put_hevc_qpel_h_1_v_1_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, 
int width, int height, int16_t* mcbuffer); void ff_hevc_put_hevc_qpel_h_1_v_2_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int16_t* mcbuffer); void ff_hevc_put_hevc_qpel_h_1_v_3_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int16_t* mcbuffer); void ff_hevc_put_hevc_qpel_h_2_8_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int16_t* mcbuffer); void ff_hevc_put_hevc_qpel_h_2_v_1_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int16_t* mcbuffer); void ff_hevc_put_hevc_qpel_h_2_v_2_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int16_t* mcbuffer); void ff_hevc_put_hevc_qpel_h_2_v_3_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int16_t* mcbuffer); void ff_hevc_put_hevc_qpel_h_3_8_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int16_t* mcbuffer); void ff_hevc_put_hevc_qpel_h_3_v_1_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int16_t* mcbuffer); void ff_hevc_put_hevc_qpel_h_3_v_2_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int16_t* mcbuffer); void ff_hevc_put_hevc_qpel_h_3_v_3_sse(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int width, int height, int16_t* mcbuffer); #endif libde265-1.0.18/libde265/x86/sse.cc000066400000000000000000000074761515675107500162110ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. 
* * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifdef _MSC_VER #include #endif #include "x86/sse.h" #include "x86/sse-motion.h" #include "x86/sse-dct.h" #ifdef HAVE_CONFIG_H #include "config.h" #endif #if defined(__GNUC__) && !defined(__EMSCRIPTEN__) #include #endif void init_acceleration_functions_sse(struct acceleration_functions* accel) { uint32_t ecx=0,edx=0; #ifdef _MSC_VER uint32_t regs[4]; int a = 1; __cpuid((int *)regs, (int)a); ecx = regs[2]; edx = regs[3]; #elif !defined(__EMSCRIPTEN__) uint32_t eax,ebx; __get_cpuid(1, &eax,&ebx,&ecx,&edx); #endif #ifdef __EMSCRIPTEN__ int have_SSE = 0; int have_SSE4_1 = 0; #ifdef __SSE__ have_SSE = 1; #endif #ifdef __SSE4_1__ have_SSE4_1 = 1; #endif #else // printf("CPUID EAX=1 -> ECX=%x EDX=%x\n", regs[2], regs[3]); //int have_MMX = !!(edx & (1<<23)); int have_SSE = !!(edx & (1<<25)); int have_SSE4_1 = !!(ecx & (1<<19)); // printf("MMX:%d SSE:%d SSE4_1:%d\n",have_MMX,have_SSE,have_SSE4_1); if (have_SSE) { } #endif #if HAVE_SSE4_1 if (have_SSE4_1) { accel->put_unweighted_pred_8 = ff_hevc_put_unweighted_pred_8_sse; accel->put_weighted_pred_avg_8 = ff_hevc_put_weighted_pred_avg_8_sse; accel->put_hevc_epel_8 = ff_hevc_put_hevc_epel_pixels_8_sse; accel->put_hevc_epel_h_8 = ff_hevc_put_hevc_epel_h_8_sse; accel->put_hevc_epel_v_8 = ff_hevc_put_hevc_epel_v_8_sse; accel->put_hevc_epel_hv_8 = ff_hevc_put_hevc_epel_hv_8_sse; accel->put_hevc_qpel_8[0][0] = 
ff_hevc_put_hevc_qpel_pixels_8_sse; accel->put_hevc_qpel_8[0][1] = ff_hevc_put_hevc_qpel_v_1_8_sse; accel->put_hevc_qpel_8[0][2] = ff_hevc_put_hevc_qpel_v_2_8_sse; accel->put_hevc_qpel_8[0][3] = ff_hevc_put_hevc_qpel_v_3_8_sse; accel->put_hevc_qpel_8[1][0] = ff_hevc_put_hevc_qpel_h_1_8_sse; accel->put_hevc_qpel_8[1][1] = ff_hevc_put_hevc_qpel_h_1_v_1_sse; accel->put_hevc_qpel_8[1][2] = ff_hevc_put_hevc_qpel_h_1_v_2_sse; accel->put_hevc_qpel_8[1][3] = ff_hevc_put_hevc_qpel_h_1_v_3_sse; accel->put_hevc_qpel_8[2][0] = ff_hevc_put_hevc_qpel_h_2_8_sse; accel->put_hevc_qpel_8[2][1] = ff_hevc_put_hevc_qpel_h_2_v_1_sse; accel->put_hevc_qpel_8[2][2] = ff_hevc_put_hevc_qpel_h_2_v_2_sse; accel->put_hevc_qpel_8[2][3] = ff_hevc_put_hevc_qpel_h_2_v_3_sse; accel->put_hevc_qpel_8[3][0] = ff_hevc_put_hevc_qpel_h_3_8_sse; accel->put_hevc_qpel_8[3][1] = ff_hevc_put_hevc_qpel_h_3_v_1_sse; accel->put_hevc_qpel_8[3][2] = ff_hevc_put_hevc_qpel_h_3_v_2_sse; accel->put_hevc_qpel_8[3][3] = ff_hevc_put_hevc_qpel_h_3_v_3_sse; accel->transform_skip_8 = ff_hevc_transform_skip_8_sse; // actually, for these two functions, the scalar fallback seems to be faster than the SSE code //accel->transform_4x4_luma_add_8 = ff_hevc_transform_4x4_luma_add_8_sse4; // SSE-4 only TODO //accel->transform_4x4_add_8 = ff_hevc_transform_4x4_add_8_sse4; accel->transform_add_8[1] = ff_hevc_transform_8x8_add_8_sse4; accel->transform_add_8[2] = ff_hevc_transform_16x16_add_8_sse4; accel->transform_add_8[3] = ff_hevc_transform_32x32_add_8_sse4; } #endif } libde265-1.0.18/libde265/x86/sse.h000066400000000000000000000016641515675107500160440ustar00rootroot00000000000000/* * H.265 video codec. * Copyright (c) 2013-2014 struktur AG, Dirk Farin * * This file is part of libde265. 
* * libde265 is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * libde265 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libde265. If not, see . */ #ifndef DE265_SSE_H #define DE265_SSE_H #include "acceleration.h" void init_acceleration_functions_sse(struct acceleration_functions* accel); #endif libde265-1.0.18/sherlock265/000077500000000000000000000000001515675107500152005ustar00rootroot00000000000000libde265-1.0.18/sherlock265/CMakeLists.txt000066400000000000000000000014001515675107500177330ustar00rootroot00000000000000find_package(Qt5 COMPONENTS Core Gui Widgets QUIET) if(NOT Qt5_FOUND) message(STATUS "Qt5 not found, skipping sherlock265") return() endif() set(CMAKE_AUTOMOC ON) add_executable(sherlock265 sherlock265.cc VideoPlayer.cc VideoDecoder.cc VideoWidget.cc VideoPlayer.h VideoDecoder.h VideoWidget.h ) target_link_libraries(sherlock265 PRIVATE de265 Qt5::Core Qt5::Gui Qt5::Widgets Threads::Threads ) find_package(PkgConfig QUIET) if(PkgConfig_FOUND) pkg_check_modules(SWSCALE IMPORTED_TARGET libswscale) if(SWSCALE_FOUND) target_compile_definitions(sherlock265 PRIVATE HAVE_SWSCALE) target_link_libraries(sherlock265 PRIVATE PkgConfig::SWSCALE) endif() endif() install(TARGETS sherlock265 DESTINATION ${CMAKE_INSTALL_BINDIR}) libde265-1.0.18/sherlock265/COPYING000066400000000000000000000021061515675107500162320ustar00rootroot00000000000000 MIT License Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal 
in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. libde265-1.0.18/sherlock265/README000066400000000000000000000023011515675107500160540ustar00rootroot00000000000000 description of graphical overlays --------------------------------- CB - Show Coding Block quadtree structure. Prediction modes are signalled at this level. CBs can be further subdivided into PBs for prediction and TBs for residual transforms. PB - Show Prediction Block structure. CB blocks may be further subdivided, possibly using asymmetric partitionings. This is the level on which motion-compensation and intra-prediction is performed. TB - Show Transformation Block structure. DCT/DSTs are carried out on this level. QP - Quantization Parameter shown as greyscale value. Brighter blocks for larger QP values (lower quality). IntraPred - Show intra prediction mode. * Directional prediction is depicted with a line in the prediction direction (out of 32 possible directions) * Planar prediction is depicted by a square. * DC prediction is depicted by a circle. PredMode - Show prediction mode. 
* red: intra * blue: inter * green: skip = inter mode with no PB subdivision and candidate from merge list MV - Show motion vectors. Vectors from list L0 are drawn in red, motion vectors from L1 are green. Vectors are magnified by a factor of 4. libde265-1.0.18/sherlock265/VideoDecoder.cc000066400000000000000000000227241515675107500200520ustar00rootroot00000000000000/* libde265 example application "sherlock265". MIT License Copyright (c) 2013-2014 struktur AG, Dirk Farin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include "VideoDecoder.h" #ifdef HAVE_VIDEOGFX #include #endif #ifdef HAVE_VIDEOGFX using namespace videogfx; #endif //#include "decctx.h" #include "visualize.h" VideoDecoder::VideoDecoder() : mFH(NULL), ctx(NULL), img(NULL), mNextBuffer(0), mFrameCount(0), mPlayingVideo(false), mVideoEnded(false), mSingleStep(false), mShowDecodedImage(true), mShowQuantPY(false), mCBShowPartitioning(false), mTBShowPartitioning(false), mPBShowPartitioning(false), mShowIntraPredMode(false), mShowPBPredMode(false), mShowMotionVec(false), mShowTiles(false), mShowSlices(false) #ifdef HAVE_SWSCALE , sws(NULL) , width(0) , height(0) #endif { } VideoDecoder::~VideoDecoder() { free_decoder(); #ifdef HAVE_SWSCALE if (sws != NULL) { sws_freeContext(sws); } #endif } void VideoDecoder::run() { decoder_loop(); } void VideoDecoder::init(const char* filename) { init_decoder(filename); } void VideoDecoder::startDecoder() { if (mPlayingVideo || mVideoEnded) { return; } mPlayingVideo=true; exit(); } void VideoDecoder::stopDecoder() { if (!mPlayingVideo) { return; } mPlayingVideo=false; } void VideoDecoder::singleStepDecoder() { if (mPlayingVideo || mVideoEnded) { return; } mPlayingVideo=true; mSingleStep=true; exit(); } void VideoDecoder::decoder_loop() { for (;;) { if (mPlayingVideo) { mutex.lock(); if (img) { img = NULL; de265_release_next_picture(ctx); } img = de265_peek_next_picture(ctx); while (img==NULL) { mutex.unlock(); int more=1; de265_error err = de265_decode(ctx, &more); mutex.lock(); if (more && err == DE265_OK) { // try again to get picture img = de265_peek_next_picture(ctx); } else if (more && err == DE265_ERROR_WAITING_FOR_INPUT_DATA) { uint8_t buf[4096]; int buf_size = fread(buf,1,sizeof(buf),mFH); int err = de265_push_data(ctx,buf,buf_size ,0,0); (void)err; } else if (!more) { mVideoEnded=true; mPlayingVideo=false; // TODO: send signal back break; } } // show one decoded picture if (img) { show_frame(img); if (mSingleStep) { mSingleStep=false; mPlayingVideo=false; } } 
mutex.unlock(); // process events QCoreApplication::processEvents(); } else { exec(); } } } #ifdef HAVE_VIDEOGFX void VideoDecoder::convert_frame_libvideogfx(const de265_image* img, QImage & qimg) { // --- convert to RGB --- de265_chroma chroma = de265_get_chroma_format(img); int map[3]; Image visu; if (chroma == de265_chroma_420) { visu.Create(img->get_width(), img->get_height(), Colorspace_YUV, Chroma_420); map[0]=0; map[1]=1; map[2]=2; } else { visu.Create(img->get_width(), img->get_height(), Colorspace_RGB, Chroma_444); map[0]=1; map[1]=2; map[2]=0; } for (int y=0;yget_height(0);y++) { memcpy(visu.AskFrame(BitmapChannel(map[0]))[y], img->get_image_plane_at_pos(0, 0,y), img->get_width(0)); } for (int y=0;yget_height(1);y++) { memcpy(visu.AskFrame(BitmapChannel(map[1]))[y], img->get_image_plane_at_pos(1, 0,y), img->get_width(1)); } for (int y=0;yget_height(2);y++) { memcpy(visu.AskFrame(BitmapChannel(map[2]))[y], img->get_image_plane_at_pos(2, 0,y), img->get_width(2)); } Image debugvisu; ChangeColorspace(debugvisu, visu, Colorspace_RGB); // --- convert to QImage --- uchar* ptr = qimg.bits(); int bpl = qimg.bytesPerLine(); for (int y=0;yget_height();y++) { for (int x=0;xget_width();x++) { *(uint32_t*)(ptr+x*4) = ((debugvisu.AskFrameR()[y][x] << 16) | (debugvisu.AskFrameG()[y][x] << 8) | (debugvisu.AskFrameB()[y][x] << 0)); } ptr += bpl; } } #endif #ifdef HAVE_SWSCALE void VideoDecoder::convert_frame_swscale(const de265_image* img, QImage & qimg) { if (sws == NULL || img->get_width() != width || img->get_height() != height) { if (sws != NULL) { sws_freeContext(sws); } width = img->get_width(); height = img->get_height(); sws = sws_getContext(width, height, AV_PIX_FMT_YUV420P, width, height, AV_PIX_FMT_BGRA, SWS_FAST_BILINEAR, NULL, NULL, NULL); } int stride[3]; const uint8_t *data[3]; for (int c=0;c<3;c++) { data[c] = img->get_image_plane(c); stride[c] = img->get_image_stride(c); } uint8_t *qdata[1] = { (uint8_t *) qimg.bits() }; int qstride[1] = { 
qimg.bytesPerLine() }; sws_scale(sws, data, stride, 0, img->get_height(), qdata, qstride); } #endif void VideoDecoder::show_frame(const de265_image* img) { if (mFrameCount==0) { mImgBuffers[0] = QImage(QSize(img->get_width(),img->get_height()), QImage::Format_RGB32); mImgBuffers[1] = QImage(QSize(img->get_width(),img->get_height()), QImage::Format_RGB32); } // --- convert to RGB (or generate a black image if video image is disabled) --- QImage* qimg = &mImgBuffers[mNextBuffer]; uchar* ptr = qimg->bits(); int bpl = qimg->bytesPerLine(); if (mShowDecodedImage) { #ifdef HAVE_VIDEOGFX convert_frame_libvideogfx(img, *qimg); #elif HAVE_SWSCALE convert_frame_swscale(img, *qimg); #else qimg->fill(QColor(0, 0, 0)); #endif } else { qimg->fill(QColor(0, 0, 0)); } // --- overlay coding-mode visualization --- if (mShowQuantPY) { draw_QuantPY(img, ptr, bpl, 4); } if (mShowPBPredMode) { draw_PB_pred_modes(img, ptr, bpl, 4); } if (mShowIntraPredMode) { draw_intra_pred_modes(img, ptr, bpl, 0x009090ff, 4); } if (mTBShowPartitioning) { draw_TB_grid(img, ptr, bpl, 0x00ff6000, 4); } if (mPBShowPartitioning) { draw_PB_grid(img, ptr, bpl, 0x00e000, 4); } if (mCBShowPartitioning) { draw_CB_grid(img, ptr, bpl, 0x00FFFFFF, 4); } if (mShowMotionVec) { draw_Motion(img, ptr, bpl, 4); } if (mShowSlices) { draw_Slices(img, ptr, bpl, 4); } if (mShowTiles) { draw_Tiles(img, ptr, bpl, 4); } emit displayImage(qimg); mNextBuffer = 1-mNextBuffer; mFrameCount++; } void VideoDecoder::showCBPartitioning(bool flag) { mCBShowPartitioning=flag; mutex.lock(); if (img != NULL) { show_frame(img); } mutex.unlock(); } void VideoDecoder::showTBPartitioning(bool flag) { mTBShowPartitioning=flag; mutex.lock(); if (img != NULL) { show_frame(img); } mutex.unlock(); } void VideoDecoder::showPBPartitioning(bool flag) { mPBShowPartitioning=flag; mutex.lock(); if (img != NULL) { show_frame(img); } mutex.unlock(); } void VideoDecoder::showIntraPredMode(bool flag) { mShowIntraPredMode=flag; mutex.lock(); if (img != NULL) { 
show_frame(img); } mutex.unlock(); } void VideoDecoder::showPBPredMode(bool flag) { mShowPBPredMode=flag; mutex.lock(); if (img != NULL) { show_frame(img); } mutex.unlock(); } void VideoDecoder::showQuantPY(bool flag) { mShowQuantPY=flag; mutex.lock(); if (img != NULL) { show_frame(img); } mutex.unlock(); } void VideoDecoder::showMotionVec(bool flag) { mShowMotionVec=flag; mutex.lock(); if (img != NULL) { show_frame(img); } mutex.unlock(); } void VideoDecoder::showDecodedImage(bool flag) { mShowDecodedImage=flag; mutex.lock(); if (img != NULL) { show_frame(img); } mutex.unlock(); } void VideoDecoder::showTiles(bool flag) { mShowTiles=flag; mutex.lock(); if (img != NULL) { show_frame(img); } mutex.unlock(); } void VideoDecoder::showSlices(bool flag) { mShowSlices=flag; mutex.lock(); if (img != NULL) { show_frame(img); } mutex.unlock(); } void VideoDecoder::init_decoder(const char* filename) { mFH = fopen(filename,"rb"); //init_file_context(&inputctx, filename); //rbsp_buffer_init(&buf); ctx = de265_new_decoder(); de265_start_worker_threads(ctx, 4); // start 4 background threads } void VideoDecoder::free_decoder() { if (mFH) { fclose(mFH); } if (ctx) { de265_free_decoder(ctx); } } libde265-1.0.18/sherlock265/VideoDecoder.h000066400000000000000000000060321515675107500177060ustar00rootroot00000000000000/* libde265 example application "sherlock265". MIT License Copyright (c) 2013-2014 struktur AG, Dirk Farin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */

#ifndef VIDEODECODER_HH
#define VIDEODECODER_HH

#ifdef HAVE_CONFIG_H
#include
#endif

#include
#ifdef HAVE_SWSCALE
#ifdef __cplusplus
extern "C" {
#endif
#include
#ifdef __cplusplus
}
#endif
#endif

#include "VideoWidget.h"
#include "de265.h"


// Thread object that reads a bitstream file, decodes it with libde265 and
// publishes the pictures as QImages through the displayImage() signal.
// It also provides the slots used by the GUI to control playback and to
// switch the analysis overlays on and off.
class VideoDecoder : public QThread
{
  Q_OBJECT

public:
  VideoDecoder();
  ~VideoDecoder();

  // NOTE(review): presumably forwards 'filename' to init_decoder() - confirm
  // in the implementation file.
  void init(const char* filename);

protected:
  void run();  // thread entry point

public slots:
  // Playback control.
  void startDecoder();
  void stopDecoder();
  void singleStepDecoder();

  // Overlay toggles: each one stores 'flag' and, if a picture is currently
  // available, redraws it immediately via show_frame().
  void showCBPartitioning(bool flag);
  void showTBPartitioning(bool flag);
  void showPBPartitioning(bool flag);
  void showIntraPredMode(bool flag);
  void showPBPredMode(bool flag);
  void showQuantPY(bool flag);
  void showMotionVec(bool flag);
  void showTiles(bool flag);
  void showSlices(bool flag);
  void showDecodedImage(bool flag);

signals:
  // Emitted by show_frame() with a pointer to one of the two internal
  // buffers (mImgBuffers); no copy of the image is made.
  void displayImage(QImage*);

private:
  // de265 decoder

  FILE* mFH;   // input file, opened by init_decoder() and closed by free_decoder()
  //input_context_FILE inputctx;
  //rbsp_buffer buf;
  de265_decoder_context* ctx;  // created by init_decoder(), released by free_decoder()
  const de265_image* img;      // picture redrawn by the show*() slots; NULL if none is available

  QMutex mutex;  // held by the show*() slots while they access 'img' and call show_frame()

  QImage mImgBuffers[2];  // display buffers, written alternately by show_frame()
  int    mNextBuffer;     // index (0 or 1) of the buffer the next show_frame() call writes to
  int    mFrameCount;     // number of show_frame() calls so far

  // NOTE(review): playback state, presumably driven by the start/stop/
  // single-step slots and evaluated in decoder_loop() - confirm there.
  bool   mPlayingVideo;
  bool   mVideoEnded;
  bool   mSingleStep;


  // Overlay switches evaluated by show_frame().
  bool   mShowDecodedImage;   // false: the picture is replaced by a black image
  bool   mShowQuantPY;
  bool   mCBShowPartitioning;
  bool   mTBShowPartitioning;
  bool   mPBShowPartitioning;
  bool   mShowIntraPredMode;
  bool   mShowPBPredMode;
  bool   mShowMotionVec;
  bool   mShowTiles;
  bool   mShowSlices;

  void decoder_loop();

  // Open the input file and create the decoder context / release both again.
  void init_decoder(const char* filename);
  void free_decoder();

  // Convert 'img' into the next display buffer, draw the enabled overlays
  // and emit displayImage().
  void show_frame(const de265_image* img);

  // Colorspace conversion back-ends; show_frame() uses libvideogfx when
  // HAVE_VIDEOGFX is defined, otherwise swscale if available.
#ifdef HAVE_VIDEOGFX
  void convert_frame_libvideogfx(const de265_image* img, QImage & qimg);
#endif
#ifdef HAVE_SWSCALE
  // NOTE(review): presumably the cached swscale context and the picture size
  // it was created for - confirm in convert_frame_swscale().
  SwsContext* sws;
  int width;
  int height;
  void convert_frame_swscale(const de265_image* img, QImage & qimg);
#endif
};

#endif
libde265-1.0.18/sherlock265/VideoPlayer.cc000066400000000000000000000120651515675107500177360ustar00rootroot00000000000000/* libde265 example application "sherlock265". MIT License Copyright (c) 2013-2014 struktur AG, Dirk Farin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/ #include "VideoPlayer.h" VideoPlayer::VideoPlayer(const char* filename) { mDecoder = new VideoDecoder; mDecoder->init(filename); videoWidget = new VideoWidget; stopButton = new QPushButton("Stop"); //QObject::connect(stopButton, SIGNAL(clicked()), qApp, SLOT(stop())); QObject::connect(stopButton, SIGNAL(clicked()), mDecoder, SLOT(stopDecoder())); startButton = new QPushButton("&Start"); QObject::connect(startButton, SIGNAL(clicked()), mDecoder, SLOT(startDecoder())); QPushButton* stepButton = new QPushButton("Step"); QObject::connect(stepButton, SIGNAL(clicked()), mDecoder, SLOT(singleStepDecoder())); QObject::connect(mDecoder, SIGNAL(displayImage(QImage*)), videoWidget, SLOT(setImage(QImage*)), Qt::QueuedConnection); QPushButton* showCBPartitioningButton = new QPushButton("CB-tree"); showCBPartitioningButton->setCheckable(true); QObject::connect(showCBPartitioningButton, SIGNAL(toggled(bool)), mDecoder, SLOT(showCBPartitioning(bool))); QPushButton* showTBPartitioningButton = new QPushButton("TB-tree"); showTBPartitioningButton->setCheckable(true); QObject::connect(showTBPartitioningButton, SIGNAL(toggled(bool)), mDecoder, SLOT(showTBPartitioning(bool))); QPushButton* showPBPartitioningButton = new QPushButton("PB-tree"); showPBPartitioningButton->setCheckable(true); QObject::connect(showPBPartitioningButton, SIGNAL(toggled(bool)), mDecoder, SLOT(showPBPartitioning(bool))); QPushButton* showIntraPredModeButton = new QPushButton("intra-pred"); showIntraPredModeButton->setCheckable(true); QObject::connect(showIntraPredModeButton, SIGNAL(toggled(bool)), mDecoder, SLOT(showIntraPredMode(bool))); QPushButton* showPBPredModeButton = new QPushButton("PB-mode"); showPBPredModeButton->setCheckable(true); QObject::connect(showPBPredModeButton, SIGNAL(toggled(bool)), mDecoder, SLOT(showPBPredMode(bool))); QPushButton* showQuantPYButton = new QPushButton("Quant"); showQuantPYButton->setCheckable(true); QObject::connect(showQuantPYButton, SIGNAL(toggled(bool)), mDecoder, 
SLOT(showQuantPY(bool))); QPushButton* showMotionVecButton = new QPushButton("MotionVec"); showMotionVecButton->setCheckable(true); QObject::connect(showMotionVecButton, SIGNAL(toggled(bool)), mDecoder, SLOT(showMotionVec(bool))); QPushButton* showTilesButton = new QPushButton("Tiles"); showTilesButton->setCheckable(true); QObject::connect(showTilesButton, SIGNAL(toggled(bool)), mDecoder, SLOT(showTiles(bool))); QPushButton* showSlicesButton = new QPushButton("Slices"); showSlicesButton->setCheckable(true); QObject::connect(showSlicesButton, SIGNAL(toggled(bool)), mDecoder, SLOT(showSlices(bool))); QPushButton* showDecodedImageButton = new QPushButton("image"); showDecodedImageButton->setCheckable(true); showDecodedImageButton->setChecked(true); QObject::connect(showDecodedImageButton, SIGNAL(toggled(bool)), mDecoder, SLOT(showDecodedImage(bool))); QGridLayout *layout = new QGridLayout; layout->addWidget(videoWidget, 0,0,1,7); layout->addWidget(startButton, 1,0,1,1); layout->addWidget(stopButton, 1,1,1,1); layout->addWidget(stepButton, 1,2,1,1); layout->addWidget(showDecodedImageButton, 1,6,1,1); layout->addWidget(showTilesButton, 1,5,1,1); layout->addWidget(showSlicesButton, 1,4,1,1); layout->addWidget(showCBPartitioningButton,2,0,1,1); layout->addWidget(showTBPartitioningButton,2,1,1,1); layout->addWidget(showPBPartitioningButton,2,2,1,1); layout->addWidget(showIntraPredModeButton, 2,3,1,1); layout->addWidget(showPBPredModeButton, 2,4,1,1); layout->addWidget(showQuantPYButton, 2,5,1,1); layout->addWidget(showMotionVecButton, 2,6,1,1); setLayout(layout); mDecoder->start(); } libde265-1.0.18/sherlock265/VideoPlayer.h000066400000000000000000000030001515675107500175650ustar00rootroot00000000000000/* libde265 example application "sherlock265". 
MIT License Copyright (c) 2013-2014 struktur AG, Dirk Farin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */

#ifndef VIDEOPLAYER_HH
#define VIDEOPLAYER_HH

#include

#include "VideoWidget.h"
#include "VideoDecoder.h"


// Main window of the application: a video display area plus the buttons that
// control playback and the analysis overlays of the decoder.
class VideoPlayer : public QWidget
{
  Q_OBJECT

public:
  // Creates the decoder for the bitstream 'filename', builds the user
  // interface and starts the decoder thread.
  VideoPlayer(const char* filename);

private:
  VideoWidget* videoWidget;   // display area, receives the decoder's displayImage() signal

  QPushButton *startButton;   // connected to VideoDecoder::startDecoder()
  QPushButton *stopButton;    // connected to VideoDecoder::stopDecoder()

  VideoDecoder* mDecoder;     // allocated with 'new' in the constructor
};

#endif
libde265-1.0.18/sherlock265/VideoWidget.cc000066400000000000000000000045421515675107500177260ustar00rootroot00000000000000/* libde265 example application "sherlock265".
MIT License Copyright (c) 2013-2014 struktur AG, Dirk Farin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include "VideoWidget.h" #include VideoWidget::VideoWidget(QWidget *parent) : QWidget(parent), mImg(NULL) { setAutoFillBackground(false); setAttribute(Qt::WA_NoSystemBackground, true); QPalette palette = this->palette(); palette.setColor(QPalette::Window, Qt::black); setPalette(palette); setSizePolicy(QSizePolicy::MinimumExpanding, QSizePolicy::MinimumExpanding); setUpdatesEnabled(true); } VideoWidget::~VideoWidget() { } QSize VideoWidget::sizeHint() const { return QSize(352,288); } void VideoWidget::paintEvent(QPaintEvent *event) { QPainter painter(this); if (mImg) { QRect videoRect = mImg->rect(); videoRect.moveCenter(this->rect().center()); //QRect erect = event->rect(); if (!videoRect.contains(event->rect())) { QRegion region = event->region(); region = region.subtracted(videoRect); QBrush brush = palette().window(); for (const QRect &rect : region) { painter.fillRect(rect, brush); } } painter.drawImage(videoRect, *mImg); } else { painter.fillRect(event->rect(), palette().window()); } } void VideoWidget::resizeEvent(QResizeEvent *event) { QWidget::resizeEvent(event); } libde265-1.0.18/sherlock265/VideoWidget.h000066400000000000000000000032771515675107500175740ustar00rootroot00000000000000/* libde265 example application "sherlock265". MIT License Copyright (c) 2013-2014 struktur AG, Dirk Farin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */

#ifndef VIDEOWIDGET_HH
#define VIDEOWIDGET_HH

#include
#if (QT_VERSION >= QT_VERSION_CHECK(5, 0, 0))
#include
#else
#include
#endif


// Widget that displays a single QImage centred on a black background.
class VideoWidget : public QWidget
{
  Q_OBJECT

public:
  VideoWidget(QWidget *parent = 0);
  ~VideoWidget();

  // Preferred size of the widget (352x288).
  QSize sizeHint() const;

public slots:
  // Show 'img' from now on and repaint immediately. Only the pointer is
  // stored, the image is not copied, so it has to stay valid while it is
  // displayed.
  void setImage(QImage* img)
  {
    mImg=img; repaint();
  }

protected:
  void paintEvent(QPaintEvent *event);
  void resizeEvent(QResizeEvent *event);

private:
  QImage* mImg;   // image currently shown; NULL until setImage() is called
};

#endif
libde265-1.0.18/sherlock265/sherlock265.cc000066400000000000000000000030251515675107500175560ustar00rootroot00000000000000/* libde265 example application "sherlock265". MIT License Copyright (c) 2013-2014 struktur AG, Dirk Farin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "VideoPlayer.h" int main(int argc, char **argv) { if (argc != 2) { fprintf(stderr,"usage: sherlock265 videofile.bin\n"); fprintf(stderr,"The video file must be a raw h.265 bitstream (e.g. HM-10.0 output)\n"); exit(5); } QApplication app(argc, argv); VideoPlayer videoPlayer(argv[1]); videoPlayer.show(); return app.exec(); }