pax_global_header00006660000000000000000000000064150226033350014511gustar00rootroot0000000000000052 comment=9916db4d71f79a937013a73698d1ba19f71e740a hipCUB-rocm-6.4.3/000077500000000000000000000000001502260333500135735ustar00rootroot00000000000000hipCUB-rocm-6.4.3/.azuredevops/000077500000000000000000000000001502260333500162205ustar00rootroot00000000000000hipCUB-rocm-6.4.3/.azuredevops/rocm-ci.yml000066400000000000000000000014041502260333500202730ustar00rootroot00000000000000resources: repositories: - repository: pipelines_repo type: github endpoint: ROCm name: ROCm/ROCm variables: - group: common - template: /.azuredevops/variables-global.yml@pipelines_repo trigger: batch: true branches: include: - develop - mainline paths: exclude: - .githooks - .github - .gitlab - .jenkins - docs - '.*.y*ml' - '*.md' - LICENSE.txt - NOTICES.txt pr: autoCancel: true branches: include: - develop - mainline paths: exclude: - .githooks - .github - .gitlab - .jenkins - docs - '.*.y*ml' - '*.md' - LICENSE.txt - NOTICES.txt drafts: false jobs: - template: ${{ variables.CI_COMPONENT_PATH }}/hipCUB.yml@pipelines_repo hipCUB-rocm-6.4.3/.clang-format000066400000000000000000000113431502260333500161500ustar00rootroot00000000000000# Style file for MLSE Libraries based on the modified rocBLAS style # Common settings BasedOnStyle: WebKit TabWidth: 4 IndentWidth: 4 UseTab: Never ColumnLimit: 100 UseCRLF: false # Other languages JavaScript, Proto --- Language: Cpp # http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code # int formatted_code; # // clang-format off # void unformatted_code ; # // clang-format on # void formatted_code_again; DisableFormat: false Standard: Cpp11 AccessModifierOffset: -4 AlignAfterOpenBracket: true AlignArrayOfStructures: Right AlignConsecutiveAssignments: true AlignConsecutiveDeclarations: true AlignEscapedNewlines: Left AlignOperands: true AlignTrailingComments: false AllowAllArgumentsOnNextLine: false AllowAllParametersOfDeclarationOnNextLine: true AllowShortBlocksOnASingleLine: Never AllowShortCaseLabelsOnASingleLine: true AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: Yes BinPackArguments: false BinPackParameters: false BitFieldColonSpacing: Both # Configure each individual brace in BraceWrapping BreakBeforeBraces: Custom # Control of individual brace wrapping cases BraceWrapping: AfterCaseLabel: true AfterClass: true AfterControlStatement: Always AfterEnum: true AfterFunction: true AfterNamespace: true AfterStruct: true AfterUnion: true AfterExternBlock: false BeforeCatch: true BeforeElse: true BeforeLambdaBody: true BeforeWhile: true IndentBraces: false SplitEmptyFunction: false SplitEmptyRecord: false SplitEmptyNamespace: false BreakBeforeBinaryOperators: All BreakBeforeTernaryOperators: true BreakConstructorInitializers: BeforeComma BreakInheritanceList: BeforeComma BreakStringLiterals: true CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true DeriveLineEnding: false DerivePointerAlignment: false EmptyLineAfterAccessModifier: Never EmptyLineBeforeAccessModifier: Always ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] IfMacros: [] IncludeBlocks: Preserve IndentAccessModifiers: false 
IndentCaseBlocks: true IndentCaseLabels: true IndentExternBlock: NoIndent IndentPPDirectives: BeforeHash IndentWrappedFunctionNames: true KeepEmptyLinesAtTheStartOfBlocks: true LambdaBodyIndentation: Signature MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None PPIndentWidth: -1 PackConstructorInitializers: NextLine PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Left QualifierAlignment: Leave ReferenceAlignment: Pointer ReflowComments: false ShortNamespaceLines: 0 SortIncludes: CaseSensitive SortUsingDeclarations: true SpaceAfterCStyleCast: false SpaceAfterLogicalNot: false SpaceAfterTemplateKeyword: false SpaceAroundPointerQualifiers: Default SpaceBeforeAssignmentOperators: true SpaceBeforeCaseColon: false SpaceBeforeCpp11BracedList: false SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true SpaceBeforeParens: Never SpaceBeforeRangeBasedForLoopColon: true SpaceBeforeSquareBrackets: false SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: Never SpacesInCStyleCastParentheses: false SpacesInConditionalStatement: false SpacesInContainerLiterals: true SpacesInParentheses: false SpacesInSquareBrackets: false AttributeMacros: - __host__ - __device__ - __global__ - __forceinline__ - __shared__ - __launch_bounds__ - HIPCUB_DEVICE - HIPCUB_HOST - HIPCUB_HOST_DEVICE - HIPCUB_SHARED_MEMORY - HIPCUB_RUNTIME_FUNCTION - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS # Trick clang into thinking that our C-style attributes are C++-style attributes # Make sure that the sizes line up for linebreaks etc Macros: - __host__=[[host]] - __device__=[[device]] - __global__=[[global]] - __forceinline__=[[forceinline]] - __shared__=[[shared]] - __launch_bounds__(x)=[[launch_bounds(x)]] - __attribute__(x)=[[attribute(x)]] - HIPCUB_DEVICE=[[DEVICE___]] - HIPCUB_HOST=[[HOST___]] - HIPCUB_HOST_DEVICE=[[HOST_DEVICE___]] - HIPCUB_SHARED_MEMORY=[[SHARED_MEMORY___]] - HIPCUB_RUNTIME_FUNCTION=[[RUNTIME_FUNCTION___]] - HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS=[[DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS___]] BreakAfterAttributes: Always --- hipCUB-rocm-6.4.3/.githooks/000077500000000000000000000000001502260333500155005ustar00rootroot00000000000000hipCUB-rocm-6.4.3/.githooks/install000077500000000000000000000002121502260333500170670ustar00rootroot00000000000000#!/bin/sh cd "$(git rev-parse --git-dir)" cd hooks echo "Installing hooks..." ln -s ../../.githooks/pre-commit pre-commit echo "Done!" hipCUB-rocm-6.4.3/.githooks/pre-commit000077500000000000000000000015421502260333500175040ustar00rootroot00000000000000#!/bin/sh # Redirect output to stderr. exec 1>&2 check_failed=false # Do the code format check if ! "$(git rev-parse --show-toplevel)/scripts/code-format/check-format.sh" HEAD 1>&2; then printf "\n\033[31mFailed\033[0m: code format check.\n" check_failed=true fi # Do the copyright check # update & apply copyright when hook config is set, otherwise just verify opts="-qc" if [ "$(git config --get --type bool --default false hooks.updateCopyright)" = "true" ]; then opts="-qca" fi if ! "$(git rev-parse --show-toplevel)/scripts/copyright-date/check-copyright.sh" "$opts" 1>&2; then printf "\n\033[31mFailed\033[0m: copyright date check.\n" check_failed=true fi if $check_failed; then printf " Pre-commit check failed, please fix the reported errors. 
Note: Use '\033[33mgit commit --no-verify\033[0m' to bypass checks.\n" exit 1 fi hipCUB-rocm-6.4.3/.github/000077500000000000000000000000001502260333500151335ustar00rootroot00000000000000hipCUB-rocm-6.4.3/.github/CODEOWNERS000077500000000000000000000003261502260333500165320ustar00rootroot00000000000000* @stanleytsang-amd @umfranzw @RobsonRLemos @lawruble13 # Documentation files docs/* @ROCm/rocm-documentation *.md @ROCm/rocm-documentation *.rst @ROCm/rocm-documentation .readthedocs.yaml @ROCm/rocm-documentation hipCUB-rocm-6.4.3/.github/dependabot.yml000066400000000000000000000012231502260333500177610ustar00rootroot00000000000000# To get started with Dependabot version updates, you'll need to specify which # package ecosystems to update and where the package manifests are located. # Please see the documentation for all configuration options: # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values directory: "/docs/sphinx" # Location of package manifests open-pull-requests-limit: 10 schedule: interval: "daily" labels: - "documentation" - "dependencies" - "ci:docs-only" reviewers: - "samjwu" hipCUB-rocm-6.4.3/.github/workflows/000077500000000000000000000000001502260333500171705ustar00rootroot00000000000000hipCUB-rocm-6.4.3/.github/workflows/docs.yaml000066400000000000000000000045551502260333500210150ustar00rootroot00000000000000name: Upload to the upload server # Controls when the workflow will run on: push: branches: [develop, master] tags: - rocm-5.* release: types: [published] # Allows you to run this workflow manually from the Actions tab workflow_dispatch: # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: # This workflow contains a single job called "build" build: # The type of runner that the job will run on runs-on: ubuntu-latest # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v2 - name: getting branch name shell: bash run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})" id: branch_name - name: getting tag name shell: bash run: echo "##[set-output name=tag;]$(echo ${GITHUB_REF_NAME})" id: tag_name - name: zipping files run: zip -r ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip . 
-x '*.git*' '*.idea*' - name: echo-step run: echo "${{ github.event.release.target_commitish }}" - name: uploading archive to prod if: ${{ steps.branch_name.outputs.branch == 'master' || github.event.release.target_commitish == 'master'}} uses: wlixcc/SFTP-Deploy-Action@v1.0 with: username: ${{ secrets.USERNAME }} server: ${{ secrets.SERVER }} ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} local_path: ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip remote_path: '${{ secrets.PROD_UPLOAD_URL }}' args: '-o ConnectTimeout=5' - name: uploading archive to staging if: ${{ steps.branch_name.outputs.branch == 'develop' || github.event.release.target_commitish == 'develop' }} uses: wlixcc/SFTP-Deploy-Action@v1.0 with: username: ${{ secrets.USERNAME }} server: ${{ secrets.SERVER }} ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} local_path: ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip remote_path: '${{ secrets.STG_UPLOAD_URL }}' args: '-o ConnectTimeout=5' hipCUB-rocm-6.4.3/.gitignore000066400000000000000000000011431502260333500155620ustar00rootroot00000000000000 ### Build dirs ### build/ # Created by https://www.gitignore.io/api/c++,cmake ### C++ ### # Prerequisites *.d # Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod *.smod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app ### CMake ### CMakeCache.txt CMakeFiles CMakeScripts Testing Makefile cmake_install.cmake install_manifest.txt compile_commands.json CTestTestfile.cmake build ### VSCODE ### .vscode .devcontainer # End of https://www.gitignore.io/api/c++,cmake hipCUB-rocm-6.4.3/.gitlab-ci.yml000066400000000000000000000274601502260333500162400ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
include: - project: amd/ci-templates ref: main file: - /defaults.yaml - /deps-cmake.yaml - /deps-docs.yaml - /deps-format.yaml - /deps-rocm.yaml - /deps-nvcc.yaml - /deps-compiler-acceleration.yaml - /gpus-rocm.yaml - /gpus-nvcc.yaml - /rules.yaml stages: - lint - build - test - benchmark clang-format: extends: - .lint:clang-format copyright-date: extends: - .deps:rocm stage: lint needs: [] tags: - build rules: - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' script: - cd $CI_PROJECT_DIR - git config --global --add safe.directory $CI_PROJECT_DIR - scripts/copyright-date/check-copyright.sh -v -d $CI_MERGE_REQUEST_DIFF_BASE_SHA # hipCUB with rocPRIM backend .rocm: variables: ROCPRIM_GIT_BRANCH: "develop_stream" extends: - .deps:rocm - .deps:cmake-minimum - .deps:compiler-acceleration before_script: - !reference [".deps:rocm", before_script] - !reference [".deps:cmake-minimum", before_script] - !reference [".deps:compiler-acceleration", before_script] # Install rocPRIM from git - BRANCH_NAME="$ROCPRIM_GIT_BRANCH" - if [ "$CI_COMMIT_BRANCH" = develop -o "$CI_COMMIT_BRANCH" = master ]; then BRANCH_NAME="$CI_COMMIT_BRANCH" - fi; - git clone -b "$BRANCH_NAME" --depth 1 https://gitlab-ci-token:${CI_JOB_TOKEN}@${ROCPRIM_GIT_URL} $CI_PROJECT_DIR/rocPRIM - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_CXX_FLAGS="-Wall -Wextra" -D CMAKE_BUILD_TYPE=Release -D BUILD_TEST=OFF -D BUILD_EXAMPLE=OFF -D ROCM_DEP_ROCMCORE=OFF -D GPU_TARGETS="$GPU_TARGETS" -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx -D CMAKE_CXX_STANDARD=14 -B $CI_PROJECT_DIR/rocPRIM/build -S $CI_PROJECT_DIR/rocPRIM - cd $CI_PROJECT_DIR/rocPRIM/build - cpack -G "DEB" - $SUDO_CMD dpkg -i rocprim*.deb build:rocm: extends: - .rocm - .gpus:rocm-gpus - .rules:build stage: build tags: - build needs: [] script: - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror -Wno-error=pass-failed" -D CMAKE_BUILD_TYPE=Release -D BUILD_TEST=ON -D BUILD_EXAMPLE=ON -D GPU_TARGETS="$GPU_TARGETS" -D GPU_TEST_TARGETS="$GPU_TARGETS" -D ROCM_SYMLINK_LIBS=OFF -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx -D CMAKE_CXX_STANDARD="$BUILD_VERSION" -B $CI_PROJECT_DIR/build -S $CI_PROJECT_DIR - cmake --build $CI_PROJECT_DIR/build - cd $CI_PROJECT_DIR/build - cpack -G "DEB;ZIP" artifacts: paths: - $CI_PROJECT_DIR/build/test/hipcub/test_* - $CI_PROJECT_DIR/build/test/CTestTestfile.cmake - $CI_PROJECT_DIR/build/test/hipcub/CTestTestfile.cmake - $CI_PROJECT_DIR/build/gtest/ - $CI_PROJECT_DIR/build/CMakeCache.txt - $CI_PROJECT_DIR/build/CTestTestfile.cmake - $CI_PROJECT_DIR/build/hipcub*.deb - $CI_PROJECT_DIR/build/hipcub*.zip - $CI_PROJECT_DIR/build/.ninja_log expire_in: 2 weeks parallel: matrix: - BUILD_VERSION: [14, 17] build:rocm-benchmark: extends: - .rocm - .gpus:rocm-gpus - .rules:build stage: build tags: - build needs: [] script: - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_CXX_FLAGS="-Wall -Wextra" -D CMAKE_BUILD_TYPE=Release -D BUILD_BENCHMARK=ON -D GPU_TARGETS="$GPU_TARGETS" -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx -D CMAKE_CXX_STANDARD=14 -B $CI_PROJECT_DIR/build -S $CI_PROJECT_DIR - cmake --build $CI_PROJECT_DIR/build artifacts: paths: - $CI_PROJECT_DIR/build/benchmark/* - $CI_PROJECT_DIR/build/deps/googlebenchmark/ - $CI_PROJECT_DIR/build/.ninja_log - $CI_PROJECT_DIR/build/CMakeCache.txt expire_in: 2 weeks test:rocm: stage: test needs: - build:rocm extends: - 
.rocm - .gpus:rocm - .rules:test script: - cd $CI_PROJECT_DIR/build - cmake -D CMAKE_PREFIX_PATH=/opt/rocm -D CMAKE_CXX_STANDARD=14 -P $CI_PROJECT_DIR/cmake/GenerateResourceSpec.cmake - cat ./resources.json # Parallel execution (with other AMDGPU processes) can oversubscribe the SDMA queue. # This causes the hipMemcpy to fail, which is not reported as an error by HIP. # As a temporary workaround, disable the SDMA for test stability. - HSA_ENABLE_SDMA=0 ctest --output-on-failure --repeat-until-fail 2 --tests-regex "$GPU_TARGET" --resource-spec-file ./resources.json --parallel $PARALLEL_JOBS .benchmark: stage: benchmark variables: BENCHMARK_FILENAME_REGEX: ^benchmark BENCHMARK_FILTER_REGEX: "" script: - python3 ${CI_PROJECT_DIR}/.gitlab/run_and_upload_benchmarks.py --api_endpoint ${BENCHMARK_API_ENDPOINT} --api_base_folder_id ${BENCHMARK_API_FOLDER_ID} --api_auth_token ${BENCHMARK_API_AUTH_TOKEN} --benchmark_dir ${CI_PROJECT_DIR}/build/benchmark --benchmark_datetime ${CI_PIPELINE_CREATED_AT} --benchmark_version ${CI_COMMIT_REF_SLUG}_MR${CI_MERGE_REQUEST_IID}_${CI_COMMIT_SHORT_SHA} --benchmark_gpu_name "${GPU}" --benchmark_filename_regex "${BENCHMARK_FILENAME_REGEX}" --benchmark_filter_regex "${BENCHMARK_FILTER_REGEX}" --no_upload artifacts: paths: - ${CI_PROJECT_DIR}/build/benchmark/*.json expire_in: 1 week benchmark:rocm: extends: - .rocm - .benchmark - .gpus:rocm - .rules:benchmark needs: - build:rocm-benchmark .test_package: script: - | if [[ -n $GPU_TARGETS ]]; then GPU_TARGETS_ARG="-DGPU_TARGETS=$GPU_TARGETS" else GPU_TARGETS_ARG="" fi - cmake -G Ninja -D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror" "$GPU_TARGETS_ARG" -D CMAKE_CXX_STANDARD=14 -S $CI_PROJECT_DIR/test/extra -B $CI_PROJECT_DIR/build/package_test - cmake --build $CI_PROJECT_DIR/build/package_test - cd $CI_PROJECT_DIR/build/package_test - ctest --output-on-failure --repeat-until-fail 2 .test:package: script: - cd $CI_PROJECT_DIR/build - $SUDO_CMD dpkg -i ${HIPCUB_DEV_PACKAGE_WILDCARD} - export CXX - !reference [".test_package", script] - $SUDO_CMD dpkg -r rocprim-dev hipcub-dev .test:install: script: - export CXX - cmake -G Ninja -D BUILD_TEST=OFF -D CMAKE_CXX_STANDARD=14 -S $CI_PROJECT_DIR -B $CI_PROJECT_DIR/build_only_install # Preserve $PATH when sudoing - $SUDO_CMD env PATH="$PATH" cmake --install $CI_PROJECT_DIR/build_only_install - !reference [".test_package", script] test:rocm_package: stage: test needs: - build:rocm variables: CXX: "$AMDCLANG" HIPCUB_DEV_PACKAGE_WILDCARD: hipcub-dev*.deb tags: - rocm extends: - .rocm - .gpus:rocm-gpus - .test:package - .rules:test test:rocm_install: stage: test needs: [] variables: CXX: "$AMDCLANG" tags: - rocm extends: - .rocm - .gpus:rocm-gpus - .test:install - .rules:test # hipCUB with CUB backend .nvcc: extends: - .deps:nvcc - .gpus:nvcc-gpus - .deps:cmake-minimum - .deps:compiler-acceleration before_script: - !reference [".deps:nvcc", before_script] - !reference [".deps:cmake-minimum", before_script] - !reference [".deps:compiler-acceleration", before_script] build:nvcc: stage: build extends: - .nvcc - .rules:build tags: - build needs: [] script: - cmake -G Ninja -D CMAKE_CXX_FLAGS="-Wall -Wextra" -D CMAKE_BUILD_TYPE=Release -D BUILD_TEST=ON -D BUILD_EXAMPLE=ON -D NVGPU_TARGETS="$GPU_TARGETS" -D ROCM_SYMLINK_LIBS=OFF -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx -D CMAKE_CUDA_COMPILER_LAUNCHER=phc_sccache_cuda -D CMAKE_CXX_STANDARD="$BUILD_VERSION" -B $CI_PROJECT_DIR/build -S $CI_PROJECT_DIR - cmake --build $CI_PROJECT_DIR/build - cd 
$CI_PROJECT_DIR/build - cpack -G "DEB;ZIP" artifacts: paths: - $CI_PROJECT_DIR/build/test/hipcub/test_* - $CI_PROJECT_DIR/build/test/CTestTestfile.cmake - $CI_PROJECT_DIR/build/test/hipcub/CTestTestfile.cmake - $CI_PROJECT_DIR/build/gtest/ - $CI_PROJECT_DIR/build/CMakeCache.txt - $CI_PROJECT_DIR/build/CTestTestfile.cmake - $CI_PROJECT_DIR/build/hipcub*.deb - $CI_PROJECT_DIR/build/hipcub*.zip - $CI_PROJECT_DIR/build/.ninja_log expire_in: 2 weeks parallel: matrix: - BUILD_VERSION: [14, 17] build:nvcc-benchmark: stage: build extends: - .nvcc - .rules:build tags: - build needs: [] script: - cmake -G Ninja -D CMAKE_CXX_FLAGS="-Wall -Wextra" -D CMAKE_BUILD_TYPE=Release -D BUILD_BENCHMARK=ON -D NVGPU_TARGETS="$GPU_TARGETS" -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx -D CMAKE_CUDA_COMPILER_LAUNCHER=phc_sccache_cuda -D CMAKE_CXX_STANDARD=14 -B $CI_PROJECT_DIR/build -S $CI_PROJECT_DIR - cmake --build $CI_PROJECT_DIR/build artifacts: paths: - $CI_PROJECT_DIR/build/benchmark/* - $CI_PROJECT_DIR/build/deps/googlebenchmark/ - $CI_PROJECT_DIR/build/.ninja_log - $CI_PROJECT_DIR/build/CMakeCache.txt expire_in: 2 weeks test:nvcc: stage: test needs: - build:nvcc extends: - .nvcc - .gpus:nvcc - .rules:test before_script: # This is only needed because of the legacy before_script in .gpus:nvcc would otherwise overwrite before_script - !reference [.nvcc, before_script] script: - cd $CI_PROJECT_DIR/build - ctest --output-on-failure --repeat-until-fail 2 benchmark:nvcc: needs: - build:nvcc-benchmark extends: - .nvcc - .gpus:nvcc - .benchmark - .rules:benchmark before_script: # This is only needed because of the legacy before_script in .gpus:nvcc would otherwise overwrite before_script - !reference [.nvcc, before_script] test:nvcc_package: stage: test needs: - build:nvcc variables: HIPCUB_DEV_PACKAGE_WILDCARD: hipcub_nvcc-dev*.deb tags: - nvcc extends: - .nvcc - .test:package - .rules:test test:nvcc_install: stage: test needs: [] tags: - nvcc extends: - .nvcc - .test:install - .rules:test test:doc: stage: test variables: SPHINX_DIR: $DOCS_DIR/sphinx extends: - .rules:test - .build:docs scheduled-check-changes: extends: .rules:scheduled-check-changes hipCUB-rocm-6.4.3/.gitlab/000077500000000000000000000000001502260333500151135ustar00rootroot00000000000000hipCUB-rocm-6.4.3/.gitlab/cmake-run-benchmarks.txt000066400000000000000000000002301502260333500216440ustar00rootroot00000000000000file(GLOB Benchmarks "${BENCHMARK_BINARY_DIR}/benchmark_*") foreach(Benchmark IN LISTS Benchmarks) execute_process(COMMAND ${Benchmark}) endforeach() hipCUB-rocm-6.4.3/.gitlab/run_and_upload_benchmarks.py000066400000000000000000000174171502260333500226660ustar00rootroot00000000000000import argparse from collections import namedtuple from datetime import datetime import json import os import re import stat import subprocess from urllib.parse import urljoin import urllib.request BenchmarkContext = namedtuple('BenchmarkContext', ['run_datetime', 'version', 'gpu_name', 'benchmark_dir', 'benchmark_filename_regex', 'benchmark_filter_regex']) ApiContext = namedtuple('ApiContext', ['endpoint', 'folder_id', 'auth_token']) def run_benchmarks(benchmark_context): def is_benchmark_executable(filename): if not re.match(benchmark_context.benchmark_filename_regex, filename): return False path = os.path.join(benchmark_context.benchmark_dir, filename) st_mode = os.stat(path).st_mode # we are not interested in permissions, just whether there is any execution flag set # and it is a regular file (S_IFREG) return 
(st_mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)) and (st_mode & stat.S_IFREG) success = True benchmark_names = [name for name in os.listdir(benchmark_context.benchmark_dir) if is_benchmark_executable(name)] json_paths = [] for benchmark_name in benchmark_names: results_json_name = f'{benchmark_name}_{benchmark_context.version}_{benchmark_context.gpu_name}.json' benchmark_path = os.path.join(benchmark_context.benchmark_dir, benchmark_name) results_json_path = os.path.join(benchmark_context.benchmark_dir, results_json_name) args = [ benchmark_path, '--benchmark_out_format=json', f'--benchmark_out={results_json_path}', f'--benchmark_filter={benchmark_context.benchmark_filter_regex}' ] try: subprocess.call(args) json_paths.append(results_json_path) except OSError as error: print(f'Could not run benchmark at {benchmark_path}. Error: "{error}"') success = False return success, json_paths def write_system_info(): def try_running_info(executable_name): out_filename = f'{executable_name}.txt' try: run_result = subprocess.run(executable_name, stdout=subprocess.PIPE) if run_result.returncode == 0: with open(out_filename, 'wb') as file: file.write(run_result.stdout) return out_filename except OSError: # Expected, when the executable is not available on the system pass rocminfo_filename = try_running_info('rocminfo') if rocminfo_filename: return rocminfo_filename else: return try_running_info('deviceQuery') def create_benchmark_folder(benchmark_context, api_context): formatted_datetime = datetime.strftime(benchmark_context.run_datetime, '%Y%m%d_%H%M%S') new_folder_name = f'{formatted_datetime}_{benchmark_context.version}_{benchmark_context.gpu_name}' create_folder_url = urljoin(api_context.endpoint, f'files/folder/{api_context.folder_id}') create_folder_payload = json.dumps({ 'title': new_folder_name }).encode('utf-8') create_folder_headers = { 'Content-Type': 'application/json', 'Authorization': api_context.auth_token } create_folder_request = urllib.request.Request( url=create_folder_url, data=create_folder_payload, headers=create_folder_headers, method='POST') try: with urllib.request.urlopen(create_folder_request) as response: response_data = json.loads(response.read()) new_folder_id = response_data['response']['id'] print(f"Created new folder with id {new_folder_id}") return new_folder_id except Exception as ex: print(f'Could not create folder "{new_folder_name}". Error: {ex}') return None def upload_results(folder_id, api_context, paths_to_upload): success = True upload_file_url = urljoin(api_context.endpoint, f'files/{folder_id}/upload') for path in paths_to_upload: with open(path) as file: body_bytes = file.read().encode('utf-8') filename = os.path.basename(path) upload_file_headers = { 'Content-Type': 'text/plain', 'Content-Disposition': f'attachment; filename="{filename}"', 'Authorization': api_context.auth_token } upload_file_request = urllib.request.Request(url=upload_file_url, data=body_bytes, headers=upload_file_headers, method='POST') try: with urllib.request.urlopen(upload_file_request): pass print(f'Uploaded {path}') except Exception as ex: print(f'Could not upload file "{path}". Error: {ex}') success = False return success def parse_date(date_str): """ Parses the date format provided by GitLab's builtin variable CI_PIPELINE_CREATED_AT """ return datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ') def main(): parser = argparse.ArgumentParser() parser.add_argument('--api_endpoint', help='URL that specifies the file storage API endpoint. 
For example: https://website.com/api/2.0/', required=True) parser.add_argument('--api_base_folder_id', help='The ID of the remote folder to which the benchmark results are uploaded', required=True) parser.add_argument('--api_auth_token', help='The authentication token string required by the remote API', required=True) parser.add_argument('--benchmark_dir', help='The local directory that contains the benchmark executables', required=True) parser.add_argument('--benchmark_datetime', help='The datetime string that specifies the creation date of the benchmarks. For example: "2022-03-28T13:16:09Z"', required=True) parser.add_argument('--benchmark_version', help='The identifier of the source control version of the benchmarked source code. For example a commit hash.', required=True) parser.add_argument('--benchmark_gpu_name', help='The name of the currently enabled GPU', required=True) parser.add_argument('--benchmark_filename_regex', help='Regular expression that controls the list of benchmark executables to run', default=r'^benchmark', required=False) parser.add_argument('--benchmark_filter_regex', help='Regular expression that controls the list of benchmarks to run in each benchmark executable', default='', required=False) parser.add_argument('--no_upload', help='Only run the benchmarks, do not upload them', default=False, action='store_true', required=False) args = parser.parse_args() api_context = ApiContext(args.api_endpoint, args.api_base_folder_id, args.api_auth_token) benchmark_context = BenchmarkContext( parse_date(args.benchmark_datetime), args.benchmark_version, args.benchmark_gpu_name, args.benchmark_dir, args.benchmark_filename_regex, args.benchmark_filter_regex) status = True benchmark_run_successful, to_upload_paths = run_benchmarks(benchmark_context) status = status and benchmark_run_successful sysinfo_path = write_system_info() if sysinfo_path: # not required to be successful. # Not all rocm/nvidia images have rocminfo/deviceQuery in their path to_upload_paths.append(sysinfo_path) if not args.no_upload: upload_successful = False folder_id = create_benchmark_folder(benchmark_context, api_context) if folder_id is not None: upload_successful = upload_results(folder_id, api_context, to_upload_paths) status = status and upload_successful return status if __name__ == '__main__': success = main() if success: exit(0) else: exit(1) hipCUB-rocm-6.4.3/.jenkins/000077500000000000000000000000001502260333500153125ustar00rootroot00000000000000hipCUB-rocm-6.4.3/.jenkins/common.groovy000066400000000000000000000037721502260333500200620ustar00rootroot00000000000000// This file is for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. def runCompileCommand(platform, project, jobName, boolean debug=false, boolean sameOrg=true) { project.paths.construct_build_prefix() String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release' String buildTypeDir = debug ? 'debug' : 'release' String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' //Set CI node's gfx arch as target if PR, otherwise use default targets of the library String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? 
'-DAMDGPU_TARGETS=\$gfx_arch' : '' def getRocPRIM = auxiliary.getLibrary('rocPRIM', platform.jenkinsLabel, null, sameOrg) def command = """#!/usr/bin/env bash set -x ${getRocPRIM} cd ${project.paths.project_build_prefix} mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir} ${auxiliary.gfxTargetParser()} ${cmake} --toolchain=toolchain-linux.cmake ${buildTypeArg} ${amdgpuTargets} -DBUILD_TEST=ON -DBUILD_BENCHMARK=ON ../.. make -j\$(nproc) """ platform.runCommand(this, command) } def runTestCommand (platform, project) { String sudo = auxiliary.sudo(platform.jenkinsLabel) def testCommand = "ctest --output-on-failure --verbose --timeout 900" def command = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix} cd ${project.testDirectory} ${sudo} LD_LIBRARY_PATH=/opt/rocm/lib ${testCommand} """ platform.runCommand(this, command) } def runPackageCommand(platform, project) { def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release") platform.runCommand(this, packageHelper[0]) platform.archiveArtifacts(this, packageHelper[1]) } return this hipCUB-rocm-6.4.3/.jenkins/precheckin.groovy000066400000000000000000000045561502260333500207060ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCm/rocJENKINS/ @Library('rocJenkins@pong') _ // This file is for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path; def runCI = { nodeDetails, jobName-> def prj = new rocProject('hipCUB', 'PreCheckin') prj.timeout.compile = 400 // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = false def commonGroovy def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName) } def testCommand = { platform, project-> commonGroovy.runTestCommand(platform, project) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])] jobNameList = auxiliary.appendJobNameList(jobNameList) auxiliary.registerDependencyBranchParameter(["rocPRIM"]) propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } Set seenJobNames = [] jobNameList.each { jobName, nodeDetails-> seenJobNames.add(jobName) if (urlJobName == jobName) runCI(nodeDetails, jobName) } // For url job names that are outside of the standardJobNameSet i.e. 
compute-rocm-dkms-no-npi-1901 if(!seenJobNames.contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) runCI([ubuntu16:['gfx906']], urlJobName) } } hipCUB-rocm-6.4.3/.jenkins/staticanalysis.groovy000066400000000000000000000023201502260333500216110ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCm/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCompileCommand(platform, project, jobName, boolean debug=false) { project.paths.construct_build_prefix() } def runCI = { nodeDetails, jobName-> def prj = new rocProject('hipCUB', 'StaticAnalysis') // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = false boolean staticAnalysis = true def compileCommand = { platform, project-> runCompileCommand(platform, project, jobName, false) } buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, null, null, staticAnalysis) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * 6')])])) stage(urlJobName) { runCI([ubuntu20:['any']], urlJobName) } } hipCUB-rocm-6.4.3/.jenkins/staticlibrary.groovy000066400000000000000000000045641502260333500214460ustar00rootroot00000000000000#!/usr/bin/env groovy @Library('rocJenkins@pong') _ import com.amd.project.* import com.amd.docker.* import java.nio.file.Path; def runCI = { nodeDetails, jobName-> def prj = new rocProject('hipCUB', 'Static Library PreCheckin') def nodes = new dockerNodes(nodeDetails, jobName, prj) def commonGroovy boolean formatCheck = false def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName, false, true) } def testCommand = { platform, project-> commonGroovy.runTestCommand(platform, project) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], "rocm-docker":[]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]), "compute-rocm-dkms-no-npi-hipclang":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]), "rocm-docker":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']])] jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. 
compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) stage(urlJobName) { runCI([ubuntu16:['gfx906']], urlJobName) } } } hipCUB-rocm-6.4.3/.readthedocs.yaml000066400000000000000000000005441502260333500170250ustar00rootroot00000000000000# Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details version: 2 sphinx: configuration: docs/conf.py formats: [htmlzip, pdf, epub] python: install: - requirements: docs/sphinx/requirements.txt build: os: ubuntu-22.04 tools: python: "3.10" apt_packages: - "doxygen" hipCUB-rocm-6.4.3/CHANGELOG.md000066400000000000000000000313311502260333500154050ustar00rootroot00000000000000# Changelog for hipCUB Full documentation for hipCUB is available at [https://rocm.docs.amd.com/projects/hipCUB/en/latest/](https://rocm.docs.amd.com/projects/hipCUB/en/latest/). ## hipCUB-3.4.0 for ROCm 6.4.0 ### Added * Added regression tests to `rtest.py`. These tests recreate scenarios that have caused hardware problems in past emulation environments. Use `python rtest.py [--emulation|-e|--test|-t]=regression` to run these tests. * Added extended tests to `rtest.py`. These tests are extra tests that did not fit the criteria of smoke and regression tests. These tests will take much longer to run relative to smoke and regression tests. Use `python rtest.py [--emulation|-e|--test|-t]=extended` to run these tests. * Added `ForEach`, `ForEachN`, `ForEachCopy`, `ForEachCopyN` and `Bulk` functions to have parity with CUB. * Added the `hipcub::CubVector` type for CUB parity. * Added `--emulation` option for `rtest.py` * Unit tests can be run with `[--emulation|-e|--test|-t]=` * Added `DeviceSelect::FlaggedIf` and its inplace overload. * Added CUB macros missing from hipCUB: `HIPCUB_MAX`, `HIPCUB_MIN`, `HIPCUB_QUOTIENT_FLOOR`, `HIPCUB_QUOTIENT_CEILING`, `HIPCUB_ROUND_UP_NEAREST` and `HIPCUB_ROUND_DOWN_NEAREST`. * Added `hipcub::AliasTemporaries` function for CUB parity. ### Changed * Removed usage of `std::unary_function` and `std::binary_function` in `test_hipcub_device_adjacent_difference.cpp` * Changed the subset of tests that are run for smoke tests such that the smoke test will complete with faster run-time and to never exceed 2GB of vram usage. Use `python rtest.py [--emulation|-e|--test|-t]=smoke` to run these tests. * The `rtest.py` options have changed. `rtest.py` is now run with at least either `--test|-t` or `--emulation|-e`, but not both options. * The NVIDIA backend now requires CUB, Thrust and libcu++ 2.5.0. If it is not found it will be downloaded from the NVIDIA CCCL repository. * Changed the C++ version from 14 to 17. C++14 will be deprecated in the next major release. ### Known issues * When building on Windows using HIP SDK for ROCm 6.4, ``hipMalloc`` returns ``hipSuccess`` even when the size passed to it is too large and the allocation fails. Because of this, limits have been set for the maximum test case sizes for some unit tests such as HipcubDeviceRadixSort's SortKeysLargeSizes . ## hipCUB 3.3.0 for ROCm 6.3.0 ### Added * Support for large indices in `hipcub::DeviceSegmentedReduce::*` has been added, with the exception of `DeviceSegmentedReduce::Arg*`. Although rocPRIM's backend provides support for all reduce variants, CUB does not support large indices in `DeviceSegmentedReduce::Arg*`. For this reason, large index support is not available for `hipcub::DeviceSegmentedReduce::Arg*`. 
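  As an illustration of the large-index support described above, the sketch below (not taken from the hipCUB sources) shows a segmented `Sum` whose begin/end offsets are 64-bit `size_t` values. The function and buffer names are hypothetical, and the usual two-phase temporary-storage pattern of the CUB-style API is assumed.

```cpp
#include <hipcub/hipcub.hpp>

// Sums each segment of d_in into d_out[i]; the i-th segment spans
// [d_offsets[i], d_offsets[i + 1]). Offsets are size_t, so segments
// are no longer limited to 32-bit index ranges.
hipError_t segmented_sum(const float*  d_in,
                         float*        d_out,
                         const size_t* d_offsets,
                         int           num_segments,
                         hipStream_t   stream)
{
    void*  d_temp_storage     = nullptr;
    size_t temp_storage_bytes = 0;
    // First call: query the amount of temporary storage required.
    hipError_t error = hipcub::DeviceSegmentedReduce::Sum(
        d_temp_storage, temp_storage_bytes, d_in, d_out,
        num_segments, d_offsets, d_offsets + 1, stream);
    if(error != hipSuccess) return error;
    error = hipMalloc(&d_temp_storage, temp_storage_bytes);
    if(error != hipSuccess) return error;
    // Second call: run the segmented reduction, one result per segment.
    error = hipcub::DeviceSegmentedReduce::Sum(
        d_temp_storage, temp_storage_bytes, d_in, d_out,
        num_segments, d_offsets, d_offsets + 1, stream);
    hipFree(d_temp_storage);
    return error;
}
```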
### Changed * Changed the default value of `rmake.py -a` to `default_gpus`. This is equivalent to `gfx906:xnack-,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201`. * The NVIDIA backend now requires CUB, Thrust, and libcu++ 2.3.2. ### Resolved issues * Fixed an issue in `rmake.py` where the list storing cmake options would contain individual characters instead of a full string of options. * Fixed an issue where `config.hpp` was not included in all hipCUB headers, resulting in build errors. ## hipCUB-3.2.0 for ROCm 6.2.0 ### Added * Add `DeviceCopy` function to have parity with CUB. * In the rocPRIM backend, added `enum WarpExchangeAlgorithm`, which is used as the new optional template argument for `WarpExchange`. * The potential values for the enum are `WARP_EXCHANGE_SMEM` and `WARP_EXCHANGE_SHUFFLE`. * `WARP_EXCHANGE_SMEM` stands for the previous algorithm, while `WARP_EXCHANGE_SHUFFLE` performs the exchange via shuffle operations. * `WARP_EXCHANGE_SHUFFLE` does not require any pre-allocated shared memory, but the `ItemsPerThread` must be a divisor of `WarpSize`. * Added `tuple.hpp` which defines templates `hipcub::tuple`, `hipcub::tuple_element`, `hipcub::tuple_element_t` and `hipcub::tuple_size`. * Added new overloaded member functions to `BlockRadixSort` and `DeviceRadixSort` that expose a `decomposer` argument. Keys of a custom type (`key_type`) can be sorted via these overloads, if an appropriate decomposer is passed. The decomposer has to implement `operator(const key_type&)` which returns a `hipcub::tuple` of references pointing to members of `key_type`. * On AMD GPUs (using the HIP backend), it is possible to issue hipCUB API calls inside of hipGraphs, with several exceptions: * CachingDeviceAllocator * GridBarrierLifetime * DeviceSegmentedRadixSort * DeviceRunLengthEncode Currently, these classes rely on one or more synchronous calls to function correctly. Because of this, they cannot be used inside of hipGraphs. ### Changed * The NVIDIA backend now requires CUB, Thrust and libcu++ 2.2.0. If it is not found it will be downloaded from the NVIDIA CCCL repository. ### Fixed * Fixed the derivation for the accumulator type for device scan algorithms in the rocPRIM backend being different compared to CUB. It now derives the accumulator type as the result of the binary operator. * `debug_synchronous` has been deprecated in hipCUB-2.13.2, and it no longer has any effect. With this release, passing `debug_synchronous` to the device functions results in a deprecation warning both at runtime and at compile time. * The synchronization that was previously achievable by passing `debug_synchronous=true` can now be achieved at compile time by setting the `CUB_DEBUG_SYNC` (or higher debug level) or the `HIPCUB_DEBUG_SYNC` preprocessor definition. * The compile time deprecation warnings can be disabled by defining the `HIPCUB_IGNORE_DEPRECATED_API` preprocessor definition. 
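  To illustrate the decomposer overloads added in this release, here is a minimal sketch: a callable whose `operator()` returns a `hipcub::tuple` of references to the members of a custom key that take part in the radix comparison. The key type, its members, and the decomposer name are hypothetical; the exact set of `BlockRadixSort`/`DeviceRadixSort` overloads that accept a decomposer is documented by the rocPRIM and CUB backends.

```cpp
#include <hipcub/hipcub.hpp>

// Hypothetical user-defined key type.
struct custom_key
{
    unsigned int id;
    float        weight;
};

// Decomposer for custom_key: operator() hands back a hipcub::tuple of
// references to the members that participate in the sort.
struct custom_key_decomposer
{
    __host__ __device__
    hipcub::tuple<unsigned int&, float&> operator()(custom_key& key) const
    {
        return hipcub::tuple<unsigned int&, float&>{key.id, key.weight};
    }
};

// An instance of custom_key_decomposer can then be passed to the
// BlockRadixSort and DeviceRadixSort sorting overloads that expose
// the decomposer argument.
```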
## hipCUB-3.1.0 for ROCm 6.1.0 ### Changes * CUB backend references CUB and Thrust version 2.1.0 * Updated the `HIPCUB_HOST_WARP_THREADS` macro definition to match `host_warp_size` changes from rocPRIM 3.0 * Implemented `__int128_t` and `__uint128_t` support for `radix_sort` ### Fixes * Build issues with `rmake.py` on Windows when using Visual Studio 2017 15.8 or later (due to a breaking fix with extended aligned storage) ### Additions * Interface `DeviceMemcpy::Batched` for batched memcpy from rocPRIM and CUB ## hipCUB-3.0.0 for ROCm 6.0.0 ### Changes * Removed `DOWNLOAD_ROCPRIM` * You can force rocPRIM to download using `DEPENDENCIES_FORCE_DOWNLOAD` ## hipCUB-2.13.2 for ROCm 5.7.0 ### Changes * CUB backend references CUB and Thrust version 2.0.1. * Fixed `DeviceSegmentedReduce::ArgMin` and `DeviceSegmentedReduce::ArgMax` by returning the segment-relative index instead of the absolute one * Fixed `DeviceSegmentedReduce::ArgMin` for inputs where the segment minimum is smaller than the value returned for empty segments; an equivalent fix is applied to `DeviceSegmentedReduce::ArgMax` ### Known issues * `debug_synchronous` no longer works on the CUDA platform; use `CUB_DEBUG_SYNC` instead * `DeviceReduce::Sum` doesn't compile on the CUDA platform for mixed extended-floating-point or floating-point InputT and OutputT types * `DeviceHistogram::HistogramEven` fails on CUDA platform for `[LevelT, SampleIteratorT] = [int, int]`. * `DeviceHistogram::MultiHistogramEven` fails on CUDA platform for `[LevelT, SampleIteratorT] = [int, int/unsigned short/float/double]` and `[LevelT, SampleIteratorT] = [float, double]` ## hipCUB-2.13.1 for ROCm 5.5.0 ### Additions * Benchmarks for `BlockShuffle`, `BlockLoad`, and `BlockStore` ### Changes * The CUB backend references CUB and Thrust version 1.17.2 * Improved benchmark coverage for: * `BlockScan` by adding `ExclusiveScan` * `BlockRadixSort` by adding `SortBlockedToStriped` * `WarpScan` by adding `Broadcast` * Removed references to, and workarounds for, the deprecated hcc ### Known issues * `BlockRadixRankMatch` is currently broken for the rocPRIM backend * `BlockRadixRankMatch` with a warp size that does not divide exactly by the block size is broken for the CUB backend ## hipCUB-2.13.0 for ROCm 5.4.0 ### Additions * CMake functionality improves build parallelism for the test suite that splits compilation units by function or parameters * New overload for `BlockAdjacentDifference::SubtractLeftPartialTile`, which takes a predecessor item ### Changes * Improved build parallelism of the test suite by splitting up large compilation units for `DeviceRadixSort`, `DeviceSegmentedRadixSort`, and `DeviceSegmentedSort` * The CUB backend references CUB and Thrust version 1.17.1 ### Known issues * `BlockRadixRankMatch` is currently broken for the rocPRIM backend * `BlockRadixRankMatch` with a warp size that does not divide exactly by the block size is broken for the CUB backend ## hipCUB-2.12.0 for ROCm 5.3.0 ### Additions * `UniqueByKey` device algorithm * `SubtractLeft`, `SubtractLeftPartialTile`, `SubtractRight`, and `SubtractRightPartialTile` overload in `BlockAdjacentDifference` * The old overloads (`FlagHeads`, `FlagTails`, `FlagHeadsAndTails`) are deprecated * `DeviceAdjacentDifference` algorithm * Extended benchmark suite of `DeviceHistogram`, `DeviceScan`, `DevicePartition`, `DeviceReduce`, `DeviceSegmentedReduce`, `DeviceSegmentedRadixSort`, `DeviceRadixSort`, `DeviceSpmv`, `DeviceMergeSort`, and `DeviceSegmentedSort` ### Changes * Obsolete type traits defined 
in `util_type.hpp`; use the standard library equivalents instead * CUB backend references CUB and Thrust version 1.16.0 * `DeviceRadixSort` `num_items` parameter type is now templated instead of being an int * If an integral type with a maximum size of 4 bytes is passed (an int), the former logic applies; otherwise, the algorithm uses a larger indexing type that makes it possible to sort input data over $2^{32}$ elements ## hipCUB-2.11.1 for ROCm 5.2.0 ### Additions * Packages for tests and benchmark executables on all supported operating systems using CPack ## hipCUB-2.11.0 for ROCm 5.1.0 ### Additions * Device segmented sort * `WarpMergeSort`, `WarpMask`, and thread sort from CUB 1.15.0 are supported in hipCUB * Device three-way partition ### Changes * `device_scan` and `device_segmented_scan`: `inclusive_scan` now uses the `input-type` as `accumulator-type`; `exclusive_scan` uses `initial-value-type`. * This changes the behavior of: * Small-size input types with large-size output types (e.g., short input, int output) * Low-res input with high-res output (e.g., float input, double output) * Block merge sort no longer supports non-power of two block sizes ### Known issues * Grid unit test hangs on HIP for Windows ## hipCUB-2.10.13 for ROCm 5.0.0 ### Fixes * Added missing includes to `hipcub.hpp` ### Additions * Bfloat16 support to test cases (`device_reduce` and `device_radix_sort`) * Device merge sort * Block merge sort * API update to CUB 1.14.0 ### Changes * The `SetupNVCC.cmake` automatic target selector selects all of the capabilities for all available cards with the NVIDIA backend ## hipCUB-2.10.12 for ROCm 4.5.0 ### Additions * Initial HIP on Windows support ### Changes * Packaging changed to a development package (named `hipcub-dev` for `.deb` packages and `hipcub-devel` for `.rpm` packages). Because hipCUB is a header-only library, there is no runtime package. To aid in the transition, the development package sets the `provides` field to `hipcub`, so existing packages that are dependent on hipCUB can continue to work. This `provides` feature is introduced as a deprecated feature because it will be removed in a future ROCm release. 
## hipCUB-2.10.11 for ROCm 4.4.0 ### Additions * gfx1030 support added * AddressSanitizer build option ### Fixes * `BlockRadixRank` unit test failure ## hipCUB-2.10.10 for ROCm 4.3.0 ### Additions * `DiscardOutputIterator` to backend header ## hipCUB-2.10.9 for ROCm 4.2.0 ### Additions * Support for `TexObjInputIterator` and `TexRefInputIterator` * Support for `DevicePartition` ### Changes * The minimum CMake version required is now 3.10.2 * The CUB backend has been updated to 1.11.0 ### Fixes * Benchmark build * NVCC build ## hipCUB-2.10.8 for ROCm 4.1.0 ### Added * Support for `DiscardOutputIterator` ## hipCUB-2.10.7 for ROCm 4.0.0 * No changes ## hipCUB-2.10.6 for ROCm 3.10 * No changes ## hipCUB-2.10.5 for ROCm 3.9.0 * No changes ## hipCUB-2.10.4 for ROCm 3.8.0 * No changes ## hipCUB-2.10.3 for ROCm 3.7.0 * No changes ## hipCUB-2.10.2 for ROCm 3.6.0 * No changes ## hipCUB-2.10.1 for ROCm 3.5.0 ### Additions * Improved tests with fixed and random seeds for test data ### Changes * Switched to hip-clang as default compiler * CMake searches for rocPRIM locally first and, if not found, downloads it from GitHub ### Deprecations * HCC build ### Known issues * The following unit test failures (due to issues in ROCclr runtime) have been observed: * `BlockDiscontinuity` * `BlockExchange` * `BlockHistogram` * `BlockRadixSort` * `BlockReduce` * `BlockScan` hipCUB-rocm-6.4.3/CMakeLists.txt000066400000000000000000000157061502260333500163440ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
cmake_minimum_required(VERSION 3.16 FATAL_ERROR) cmake_policy(VERSION 3.16...3.25) # Install prefix set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories") # hipCUB project project(hipcub LANGUAGES CXX) # Set CXX flags if (NOT DEFINED CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) if (CMAKE_CXX_STANDARD EQUAL 14) message(WARNING "C++14 will be deprecated in the next major release") elseif(NOT CMAKE_CXX_STANDARD EQUAL 17) message(FATAL_ERROR "Only C++14 and C++17 are supported") endif() # Set HIP flags set(CMAKE_HIP_STANDARD 14) set(CMAKE_HIP_STANDARD_REQUIRED ON) set(CMAKE_HIP_EXTENSIONS OFF) include(CheckLanguage) include(CMakeDependentOption) # Build options option(BUILD_TEST "Build tests (requires googletest)" OFF) option(DEPENDENCIES_FORCE_DOWNLOAD "Download dependencies and do not search for packages" OFF) option(DOWNLOAD_CUB "Download CUB and thrust. Do not search for CUB package" OFF) option(BUILD_BENCHMARK "Build benchmarks" OFF) option(BUILD_EXAMPLE "Build Examples" OFF) option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF) check_language(HIP) cmake_dependent_option(USE_HIPCXX "Use CMake HIP language support" OFF CMAKE_HIP_COMPILER OFF) # Set the ROCM install directory. if(WIN32) set(ROCM_ROOT "$ENV{HIP_PATH}" CACHE PATH "Root directory of the ROCm installation") else() set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") endif() # Set the header wrapper ON by default. option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" OFF) # Add hipCUB's CMake modules list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") # Set a default build type if none was specified if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) message(STATUS "Setting build type to 'Release' as none was specified.") set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "" "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOL "Add paths to linker search and installed rpath") # rocm-cmake has to be included early so that it's available to set GPU_TARGETS # If hip is included prior to setting that then it defaults to building only for the current architecture include(ROCmCMakeBuildToolsDependency) # Setup GPU targets for rocm platform if(USE_HIPCXX) enable_language(HIP) else() # Setup GPU targets for rocm platform if(NOT (CMAKE_CXX_COMPILER MATCHES ".*nvcc$" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")) if(NOT DEFINED AMDGPU_TARGETS) set(GPU_TARGETS "all" CACHE STRING "GPU architectures to compile for") else() set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for") endif() set_property(CACHE GPU_TARGETS PROPERTY STRINGS "all") if(GPU_TARGETS STREQUAL "all") if(BUILD_ADDRESS_SANITIZER) # ASAN builds require xnack rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+" ) else() rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201" ) endif() set(GPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for" FORCE) endif() endif() endif() # Find and verify HIP. 
include(VerifyCompiler) # Get dependencies (except rocm-cmake, included earlier) include(Dependencies) if(BUILD_ADDRESS_SANITIZER) add_compile_options(-fsanitize=address -shared-libasan) add_link_options(-fuse-ld=lld) endif() # Setup VERSION set(VERSION_STRING "3.4.0") rocm_setup_version(VERSION ${VERSION_STRING}) # Print configuration summary include(cmake/Summary.cmake) print_configuration_summary() # hipCUB library add_subdirectory(hipcub) if(BUILD_TEST OR (BUILD_BENCHMARK AND NOT ONLY_INSTALL)) rocm_package_setup_component(clients) endif() # Tests if(BUILD_TEST) enable_testing() rocm_package_setup_client_component(tests) add_subdirectory(test) endif() # Examples if(BUILD_EXAMPLE) add_subdirectory(examples) endif() # Benchmarks if(BUILD_BENCHMARK AND NOT ONLY_INSTALL) rocm_package_setup_client_component(benchmarks) add_subdirectory(benchmark) endif() # Create header wrapper for backward compatibility if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY AND NOT WIN32) rocm_wrap_header_dir( ${PROJECT_SOURCE_DIR}/hipcub/include/hipcub/ PATTERNS "*.h" PATTERN "*.hpp" GUARDS SYMLINK WRAPPER WRAPPER_LOCATIONS cub/${CMAKE_INSTALL_INCLUDEDIR}/hipcub/ OUTPUT_LOCATIONS cub/wrapper/include/hipcub/ ) endif() # Package if(HIP_COMPILER STREQUAL "clang") rocm_package_add_deb_dependencies(DEPENDS "rocprim-dev >= 2.10.1") rocm_package_add_rpm_dependencies(DEPENDS "rocprim-devel >= 2.10.1") set(CPACK_DEBIAN_PACKAGE_REPLACES "cub-hip") set(CPACK_RPM_PACKAGE_OBSOLETES "cub-hip") else() rocm_package_add_dependencies(DEPENDS "hip-dev >= 4.4") endif() set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.txt") set(CPACK_RPM_PACKAGE_LICENSE "BSD") # if(NOT CPACK_PACKAGING_INSTALL_PREFIX) # set(CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}") # endif() set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}") if(HIP_COMPILER STREQUAL "clang") rocm_create_package( NAME hipcub DESCRIPTION "hipCUB (rocPRIM backend)" MAINTAINER "hipcub-maintainer@amd.com" HEADER_ONLY ) else() rocm_create_package( NAME hipcub_nvcc DESCRIPTION "hipCUB (CUB backend)" MAINTAINER "hipcub-maintainer@amd.com" HEADER_ONLY ) endif() hipCUB-rocm-6.4.3/CONTRIBUTING.md000066400000000000000000000134361502260333500160330ustar00rootroot00000000000000 # Contributing to hipCUB # We welcome contributions to hipCUB. Please follow these details to help ensure your contributions will be successfully accepted. ## Issue Discussion ## Please use the GitHub Issues tab to notify us of issues. * Use your best judgement for issue creation. If your issue is already listed, upvote the issue and comment or post to provide additional details, such as how you reproduced this issue. * If you're not sure if your issue is the same, err on the side of caution and file your issue. You can add a comment to include the issue number (and link) for the similar issue. If we evaluate your issue as being the same as the existing issue, we'll close the duplicate. * If your issue doesn't exist, use the issue template to file a new issue. * When filing an issue, be sure to provide as much information as possible, including script output so we can collect information about your configuration. This helps reduce the time required to reproduce your issue. * Check your issue regularly, as we may require additional information to successfully reproduce the issue. * You may also open an issue to ask questions to the maintainers about whether a proposed change meets the acceptance criteria, or to discuss an idea pertaining to the library. 
## Acceptance Criteria ##

The purpose of hipCUB is to provide a thin wrapper library on top of rocPRIM or CUB. This wrapper allows users to port CUB projects to HIP so that they can also be run on AMD hardware. Because it is a wrapper, the implementations of the algorithms that the hipCUB API calls are not contained within hipCUB. They exist within rocPRIM and CUB - the hipCUB API functions simply delegate the work to these underlying libraries. This delegation should be performed in a manner that minimizes overhead.

When a pull request is created, a number of automated checks are run. These checks:

- test the change on various OS platforms (Ubuntu, RHEL, etc.)
- run on different GPU architectures (MI-series, Radeon series cards, etc.)
- run benchmarks to check for performance degradation

In order for a change to be accepted:

- it must pass all of the automated checks
- it must undergo a code review

The GitHub "Issues" tab may also be used to discuss ideas surrounding particular features or changes, before raising pull requests.

## Code Structure ##

hipCUB is a header-only library. Library code lives inside of /hipcub/include. Code is organized by the level-of-scope at which it operates. For example, the following subdirectories are organized by hardware-level scope:

* device/ contains headers for device-level algorithms
* block/ contains headers for block-level algorithms
* warp/ contains headers for warp/wavefront-level algorithms

The following subdirectories are organized according to software-scope level:

* grid/ contains headers for grid-level operations (barriers, queues, etc.)
* thread/ contains headers for thread-level operations (load/store, scan, reduce, etc.)

Finally, the iterator/ subdirectory provides access to the iterators that are used to interact with most algorithms in the library (a short sketch illustrating these scope levels appears further below).

Back at the root level, you can find tests and benchmarks located inside directories of the same name. There is also an examples/ folder that contains a number of sample API use cases.

## Coding Style ##

C and C++ code should be formatted using `clang-format`. Use the clang-format version for Clang 9, which is available in the `/opt/rocm` directory. Please do not use your system's built-in `clang-format`, as this is an older version that will produce different results.

To format a file, use:

```
/opt/rocm/hcc/bin/clang-format -style=file -i <path-to-source-file>
```

To format all files, run the following script in the hipCUB directory:

```
#!/bin/bash
git ls-files -z *.cc *.cpp *.h *.hpp *.cl *.h.in *.hpp.in *.cpp.in | xargs -0 /opt/rocm/hcc/bin/clang-format -style=file -i
```

Also, githooks can be installed to format the code per-commit:

```
./.githooks/install
```

## Pull Request Guidelines ##

Our code contribution guidelines closely follow the model of [GitHub pull-requests](https://help.github.com/articles/using-pull-requests/). When you create a pull request, you should target the default branch. Our current default branch is the **develop** branch, which serves as our integration branch. Releases are cut to release/rocm-rel-x.y, where x and y refer to the release major and minor numbers.

### Deliverables ###

Code that introduces new features should have test coverage and benchmark coverage. hipCUB tests are located in the test/hipcub/ directory, while benchmarks can be found in the benchmark/ directory.

### Process ###

After you create a PR, you can take a look at a diff of the changes you made using the PR's "Files" tab.
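As mentioned in the Code Structure section above, the library groups primitives by level of scope. The following is a minimal illustrative sketch only (it is not part of the library sources; the kernel name, block size, and logical warp size are made up for the example) showing a block-level primitive from block/ next to a warp-level primitive from warp/:

```
#include <hipcub/hipcub.hpp> // umbrella header; pulls in the block/, warp/ and device/ headers
#include <hip/hip_runtime.h>

// Hypothetical kernel: per-block sums via hipcub::BlockReduce (block scope) and
// per-warp sums via hipcub::WarpReduce (warp scope). Assumes the grid exactly
// covers d_in with BlockSize threads per block.
__global__ void scope_example_kernel(const int* d_in, int* d_block_sums, int* d_warp_sums)
{
    constexpr int BlockSize = 256;
    constexpr int WarpSize  = 32; // logical warp size; must divide the hardware wavefront size

    using BlockReduce = hipcub::BlockReduce<int, BlockSize>;
    using WarpReduce  = hipcub::WarpReduce<int, WarpSize>;

    __shared__ typename BlockReduce::TempStorage block_storage;
    __shared__ typename WarpReduce::TempStorage  warp_storage[BlockSize / WarpSize];

    const int value   = d_in[blockIdx.x * BlockSize + threadIdx.x];
    const int warp_id = threadIdx.x / WarpSize;

    // Block scope: the result is only valid in thread 0 of the block.
    const int block_sum = BlockReduce(block_storage).Sum(value);
    if(threadIdx.x == 0)
    {
        d_block_sums[blockIdx.x] = block_sum;
    }

    // Warp scope: the result is only valid in lane 0 of each logical warp.
    const int warp_sum = WarpReduce(warp_storage[warp_id]).Sum(value);
    if(threadIdx.x % WarpSize == 0)
    {
        d_warp_sums[blockIdx.x * (BlockSize / WarpSize) + warp_id] = warp_sum;
    }
}
```

Device-level algorithms from device/ are called from host code instead and follow the usual CUB two-phase temporary-storage convention.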
PRs must pass through the checks and the code review described in the [Acceptance Criteria](#acceptance-criteria) section before they can be merged. Checks may take some time to complete. You can view their progress in the table near the bottom of the pull request page. You may also be able to use the links in the table to view logs associated with a check if it fails. During code reviews, another developer will take a look through your proposed change. If any modifications are requested (or further discussion about anything is needed), they may leave a comment. You can follow up and respond to the comment, and/or create comments of your own if you have questions or ideas. When a modification request has been completed, the conversation thread about it will be marked as resolved. To update the code in your PR (eg. in response to a code review discussion), you can simply push another commit to the branch used in your pull request. hipCUB-rocm-6.4.3/LICENSE.txt000066400000000000000000000032061502260333500154170ustar00rootroot00000000000000Copyright (c) 2010-2011, Duane Merrill. All rights reserved. Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. Modifications Copyright (c) 2019-2021, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. hipCUB-rocm-6.4.3/NOTICES.txt000066400000000000000000000117621502260333500154470ustar00rootroot00000000000000Notices and Licenses file _________________________ AMD copyrighted code (MIT) Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. crascit-downloadproject v-u (MIT) Copyright (c) 2015 Crascit Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Dependencies on nvlabs-cub v1.8 (BSD3) Copyright (c) 2010-2011, Duane Merrill. All rights reserved. Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. Modifications Copyright (c) 2019, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ROCmSoftwarePlatform-rocPRIM v1.0.0 (MIT) Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

hipCUB-rocm-6.4.3/README.md000066400000000000000000000146701502260333500150600ustar00rootroot00000000000000
# hipCUB

> [!NOTE]
> The published documentation is available at [hipCUB](https://rocm.docs.amd.com/projects/hipCUB/en/latest/index.html) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).

hipCUB is a thin wrapper library on top of [rocPRIM](https://github.com/ROCm/rocPRIM) or [CUB](https://github.com/thrust/cub). You can use it to port a CUB project into [HIP](https://github.com/ROCm/HIP) so you can use AMD hardware (and [ROCm](https://rocm.docs.amd.com/en/latest/) software). In the [ROCm](https://rocm.docs.amd.com/en/latest/) environment, hipCUB uses the rocPRIM library as the backend. On CUDA platforms, it uses CUB as the backend.

## Documentation

Documentation for hipCUB is available at [https://rocm.docs.amd.com/projects/hipCUB/en/latest/](https://rocm.docs.amd.com/projects/hipCUB/en/latest/).

To build our documentation locally, run the following code:

```shell
# Go to the hipCUB docs directory
cd hipCUB; cd docs

# Install required pip packages
python3 -m pip install -r .sphinx/requirements.txt

# Build the documentation
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html

# For example, to serve the HTML docs locally
cd _build/html
python3 -m http.server
```

## Requirements

* Git
* CMake (3.16 or later)
* For AMD GPUs:
  * AMD [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/native-install/index.html) software (1.8.0 or later)
  * The [HIP-clang](https://github.com/ROCm/HIP/blob/master/INSTALL.md#hip-clang) compiler (you must set this as the C++ compiler for ROCm)
  * The [rocPRIM](https://github.com/ROCm/rocPRIM) library
    * Automatically downloaded and built by the CMake script
    * Requires CMake 3.16.9 or later
* For NVIDIA GPUs:
  * CUDA Toolkit
  * CCCL library (>= 2.5.0)
    * Automatically downloaded and built by the CMake script
    * Requires CMake 3.15.0 or later
* Python 3.6 or higher (for HIP on Windows only; this is only required for install scripts)
* Visual Studio 2019 with Clang support (HIP on Windows only)
* Strawberry Perl (HIP on Windows only)

Optional:

* [GoogleTest](https://github.com/google/googletest)
* [Google Benchmark](https://github.com/google/benchmark)

GoogleTest and Google Benchmark are automatically downloaded and built by the CMake script.

## Build and install

To build and install hipCUB, run the following code:

```shell
git clone https://github.com/ROCm/hipCUB.git

# Go to hipCUB directory, create and go to the build directory.
cd hipCUB; mkdir build; cd build

# Configure hipCUB, set up options for your system.
# Build options:
# BUILD_TEST - OFF by default,
# BUILD_BENCHMARK - OFF by default.
# DEPENDENCIES_FORCE_DOWNLOAD - OFF by default; when ON, dependencies are downloaded to the build folder.
#
# ! IMPORTANT !
# Set C++ compiler to HIP-aware clang. You can do it by adding 'CXX=<compiler>'
# before 'cmake' or setting cmake option 'CMAKE_CXX_COMPILER' to the path to the compiler.
#
[CXX=hipcc] cmake ../. # or cmake-gui ../.

# To configure hipCUB for Nvidia platforms, 'CXX=<path-to-nvcc>', `CXX=nvcc` or omitting the flag
# entirely before 'cmake' is sufficient
[CXX=nvcc] cmake -DBUILD_TEST=ON ../. # or cmake-gui ../.

# or
cmake -DBUILD_TEST=ON ../. # or cmake-gui ../.

# or to build benchmarks
cmake -DBUILD_BENCHMARK=ON ../.

# Build
make -j4

# Optionally, run tests if they're enabled.
ctest --output-on-failure

# Package
make package

# Install
[sudo] make install
```

### HIP on Windows

Initial support for HIP on Windows is available. You can install it using the provided `rmake.py` Python script:

```shell
git clone https://github.com/ROCm/hipCUB.git
cd hipCUB

# the -i option will install rocPRIM to C:\hipSDK by default
python rmake.py -i

# the -c option will build all clients including unit tests
python rmake.py -c
```

### Using hipCUB

To use hipCUB in a CMake project, we recommend using the package configuration files.

```cmake
# On ROCm hipCUB requires rocPRIM
find_package(rocprim REQUIRED CONFIG PATHS "/opt/rocm/rocprim")

# "/opt/rocm" - default install prefix
find_package(hipcub REQUIRED CONFIG PATHS "/opt/rocm/hipcub")

...

# On ROCm: includes hipCUB headers and roc::rocprim_hip target
# On CUDA: includes only hipCUB headers, user has to include CUB directory
target_link_libraries(<your_target> hip::hipcub)
```

Include only the main header file:

```cpp
#include <hipcub/hipcub.hpp>
```

Depending on your current HIP platform, hipCUB includes CUB or rocPRIM headers.
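The device-level algorithms follow the usual CUB calling convention: call the function once with a null temporary-storage pointer to query the required size, allocate that much device memory, then call it again to run. The snippet below is a minimal sketch for illustration only (it is not taken from this repository, the function and buffer names are placeholders, and error handling is omitted):

```cpp
#include <hipcub/hipcub.hpp>
#include <hip/hip_runtime.h>

// Sums num_items integers from d_in into d_out[0] using hipcub::DeviceReduce.
// d_in and d_out are assumed to be valid device allocations.
void sum_on_device(const int* d_in, int* d_out, int num_items)
{
    void*  d_temp_storage     = nullptr;
    size_t temp_storage_bytes = 0;

    // First call: d_temp_storage is null, so only temp_storage_bytes is computed.
    hipcub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

    hipMalloc(&d_temp_storage, temp_storage_bytes);

    // Second call: performs the reduction.
    hipcub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

    hipFree(d_temp_storage);
}
```

The same two-phase pattern applies to the other `hipcub::Device*` algorithms.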
## Running unit tests

```shell
# Go to hipCUB build directory
cd hipCUB; cd build

# To run all tests
ctest

# To run unit tests for hipCUB
./test/hipcub/<unit-test-name>
```

### Using custom seeds for the tests

Go to the `hipCUB/test/hipcub/test_seed.hpp` file.

```cpp
//(1)
static constexpr int random_seeds_count = 10;

//(2)
static constexpr unsigned int seeds [] = {0, 2, 10, 1000};

//(3)
static constexpr size_t seed_size = sizeof(seeds) / sizeof(seeds[0]);
```

(1) Defines a constant that sets how many passes are performed over the tests with runtime-generated seeds. Modify at will.

(2) Defines the user-generated seeds. Each of the elements of the array is used as a seed for all tests. Modify at will. If no static seeds are desired, leave the array empty.

```cpp
static constexpr unsigned int seeds [] = {};
```

(3) Never modify this line.

## Running benchmarks

```shell
# Go to hipCUB build directory
cd hipCUB; cd build

# To run benchmark for warp functions:
# Further options can be found using --help
# [] Fields are optional
./benchmark/benchmark_warp_<function_name> [--size <size>] [--trials <trials>]

# To run benchmark for block functions:
# Further options can be found using --help
# [] Fields are optional
./benchmark/benchmark_block_<function_name> [--size <size>] [--trials <trials>]

# To run benchmark for device functions:
# Further options can be found using --help
# [] Fields are optional
./benchmark/benchmark_device_<function_name> [--size <size>] [--trials <trials>]
```

## Support

Bugs and feature requests can be reported through the [GitHub issue tracker](https://github.com/ROCm/hipCUB/issues).

## Contributing

Contributions are most welcome! Learn more at [CONTRIBUTING](./CONTRIBUTING.md).

hipCUB-rocm-6.4.3/benchmark/000077500000000000000000000000001502260333500155255ustar00rootroot00000000000000hipCUB-rocm-6.4.3/benchmark/CMakeLists.txt000066400000000000000000000114321502260333500202660ustar00rootroot00000000000000
# MIT License
#
# Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
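# Helper used for every benchmark listed at the bottom of this file:
# add_hipcub_benchmark(<source.cpp>) builds one benchmark executable from a
# single source file, marks the source as HIP (or as CUDA when compiling with
# nvcc), links it against Google Benchmark and hipCUB, places the binary under
# <build-dir>/benchmark, and registers it with the "benchmarks" install
# component. On Windows, it also copies the HIP DLLs next to the binaries.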
function(add_hipcub_benchmark BENCHMARK_SOURCE) get_filename_component(BENCHMARK_TARGET ${BENCHMARK_SOURCE} NAME_WE) if(USE_HIPCXX) set_source_files_properties(${BENCHMARK_SOURCE} PROPERTIES LANGUAGE HIP) endif() add_executable(${BENCHMARK_TARGET} ${BENCHMARK_SOURCE}) target_include_directories(${BENCHMARK_TARGET} SYSTEM BEFORE PUBLIC "$" ) target_link_libraries(${BENCHMARK_TARGET} PRIVATE benchmark::benchmark hipcub ) if((HIP_COMPILER STREQUAL "nvcc")) set_property(TARGET ${BENCHMARK_TARGET} PROPERTY CUDA_STANDARD 14) set_source_files_properties(${BENCHMARK_SOURCE} PROPERTIES LANGUAGE CUDA) target_compile_options(${BENCHMARK_TARGET} PRIVATE $<$:--expt-extended-lambda> ) target_link_libraries(${BENCHMARK_TARGET} PRIVATE hipcub_cub ) endif() set_target_properties(${BENCHMARK_TARGET} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/benchmark" ) rocm_install(TARGETS ${BENCHMARK_TARGET} COMPONENT benchmarks) if (WIN32 AND NOT DEFINED DLLS_COPIED) set(DLLS_COPIED "YES") set(DLLS_COPIED ${DLLS_COPIED} PARENT_SCOPE) # for now adding in all .dll as dependency chain is not cmake based on win32 file( GLOB third_party_dlls LIST_DIRECTORIES ON CONFIGURE_DEPENDS ${HIP_DIR}/bin/*.dll ${CMAKE_SOURCE_DIR}/rtest.* ) foreach( file_i ${third_party_dlls}) add_custom_command( TARGET ${BENCHMARK_TARGET} POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${file_i} ${PROJECT_BINARY_DIR}/benchmark ) endforeach( file_i ) endif() endfunction() # **************************************************************************** # Benchmarks # **************************************************************************** add_hipcub_benchmark(benchmark_block_adjacent_difference.cpp) add_hipcub_benchmark(benchmark_block_discontinuity.cpp) add_hipcub_benchmark(benchmark_block_exchange.cpp) add_hipcub_benchmark(benchmark_block_histogram.cpp) add_hipcub_benchmark(benchmark_block_merge_sort.cpp) add_hipcub_benchmark(benchmark_block_radix_sort.cpp) add_hipcub_benchmark(benchmark_block_radix_rank.cpp) add_hipcub_benchmark(benchmark_block_reduce.cpp) add_hipcub_benchmark(benchmark_block_run_length_decode.cpp) add_hipcub_benchmark(benchmark_block_scan.cpp) add_hipcub_benchmark(benchmark_block_shuffle.cpp) add_hipcub_benchmark(benchmark_device_adjacent_difference.cpp) add_hipcub_benchmark(benchmark_device_batch_copy.cpp) add_hipcub_benchmark(benchmark_device_batch_memcpy.cpp) add_hipcub_benchmark(benchmark_device_for.cpp) add_hipcub_benchmark(benchmark_device_histogram.cpp) add_hipcub_benchmark(benchmark_device_memory.cpp) add_hipcub_benchmark(benchmark_device_merge_sort.cpp) add_hipcub_benchmark(benchmark_device_partition.cpp) add_hipcub_benchmark(benchmark_device_radix_sort.cpp) add_hipcub_benchmark(benchmark_device_reduce_by_key.cpp) add_hipcub_benchmark(benchmark_device_reduce.cpp) add_hipcub_benchmark(benchmark_device_run_length_encode.cpp) add_hipcub_benchmark(benchmark_device_scan.cpp) add_hipcub_benchmark(benchmark_device_segmented_sort.cpp) add_hipcub_benchmark(benchmark_device_segmented_radix_sort.cpp) add_hipcub_benchmark(benchmark_device_segmented_reduce.cpp) add_hipcub_benchmark(benchmark_device_select.cpp) add_hipcub_benchmark(benchmark_device_spmv.cpp) add_hipcub_benchmark(benchmark_warp_exchange.cpp) add_hipcub_benchmark(benchmark_warp_load.cpp) add_hipcub_benchmark(benchmark_warp_reduce.cpp) add_hipcub_benchmark(benchmark_warp_scan.cpp) add_hipcub_benchmark(benchmark_warp_store.cpp) add_hipcub_benchmark(benchmark_warp_merge_sort.cpp) 
hipCUB-rocm-6.4.3/benchmark/benchmark_block_adjacent_difference.cpp000066400000000000000000000376301502260333500253510ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/block/block_adjacent_difference.hpp" #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_store.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif template __global__ __launch_bounds__(BlockSize) void kernel(Args... args) { Benchmark::template run(args...); } template struct minus { HIPCUB_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const { return a - b; } }; struct subtract_left { template __device__ static void run(const T* d_input, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); hipcub::BlockAdjacentDifference adjacent_difference; #pragma nounroll for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; if(WithTile) { adjacent_difference.SubtractLeft(input, output, minus{}, T(123)); } else { adjacent_difference.SubtractLeft(input, output, minus{}); } for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += output[i]; } __syncthreads(); } hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } }; struct subtract_left_partial_tile { template __device__ static void run(const T* d_input, const int* tile_sizes, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); hipcub::BlockAdjacentDifference adjacent_difference; int tile_size = tile_sizes[blockIdx.x]; // Try to evenly distribute the length of tile_sizes between all the trials const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; #pragma nounroll for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; if(WithTile) { adjacent_difference.SubtractLeftPartialTile(input, output, minus{}, tile_size, T(123)); } else { adjacent_difference.SubtractLeftPartialTile(input, output, minus{}, tile_size); } for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += output[i]; } // Change 
the tile_size to even out the distribution tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); __syncthreads(); } hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } }; struct subtract_right { template __device__ static void run(const T* d_input, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); hipcub::BlockAdjacentDifference adjacent_difference; #pragma nounroll for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; if(WithTile) { adjacent_difference.SubtractRight(input, output, minus{}, T(123)); } else { adjacent_difference.SubtractRight(input, output, minus{}); } for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += output[i]; } __syncthreads(); } hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } }; struct subtract_right_partial_tile { template __device__ static void run(const T* d_input, const int* tile_sizes, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); hipcub::BlockAdjacentDifference adjacent_difference; int tile_size = tile_sizes[blockIdx.x]; // Try to evenly distribute the length of tile_sizes between all the trials const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; #pragma nounroll for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; adjacent_difference.SubtractRightPartialTile(input, output, minus{}, tile_size); for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += output[i]; } // Change the tile_size to even out the distribution tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); __syncthreads(); } hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } }; template auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -> std::enable_if_t::value && !std::is_same::value> { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto num_blocks = (N + items_per_block - 1) / items_per_block; // Round up size to the next multiple of items_per_block const auto size = num_blocks * items_per_block; const std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); T* d_input; T* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK( hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice)); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(num_blocks), dim3(BlockSize), 0, stream, d_input, d_output, Trials); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } template auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -> std::enable_if_t::value || std::is_same::value> { constexpr auto items_per_block = 
BlockSize * ItemsPerThread; const auto num_blocks = (N + items_per_block - 1) / items_per_block; // Round up size to the next multiple of items_per_block const auto size = num_blocks * items_per_block; const std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); const std::vector tile_sizes = benchmark_utils::get_random_data(num_blocks, 0, items_per_block); T* d_input; int* d_tile_sizes; T* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); HIP_CHECK(hipMalloc(&d_tile_sizes, tile_sizes.size() * sizeof(tile_sizes[0]))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK( hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(d_tile_sizes, tile_sizes.data(), tile_sizes.size() * sizeof(tile_sizes[0]), hipMemcpyHostToDevice)); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(num_blocks), dim3(BlockSize), 0, stream, d_input, d_tile_sizes, d_output, Trials); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_tile_sizes)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ benchmark::RegisterBenchmark( \ std::string("block_adjacent_difference.sub_algorithm_name:" \ + name + "") \ .c_str(), \ &run_benchmark, \ stream, \ size) #define BENCHMARK_TYPE(type, block, with_tile) \ CREATE_BENCHMARK(type, block, 1, with_tile), CREATE_BENCHMARK(type, block, 3, with_tile), \ CREATE_BENCHMARK(type, block, 4, with_tile), CREATE_BENCHMARK(type, block, 8, with_tile), \ CREATE_BENCHMARK(type, block, 16, with_tile), CREATE_BENCHMARK(type, block, 32, with_tile) template void add_benchmarks(const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = {BENCHMARK_TYPE(int, 256, false), BENCHMARK_TYPE(float, 256, false), BENCHMARK_TYPE(int8_t, 256, false), BENCHMARK_TYPE(long long, 256, false), BENCHMARK_TYPE(double, 256, false)}; if(!std::is_same::value) { bs.insert(bs.end(), {BENCHMARK_TYPE(int, 256, true), BENCHMARK_TYPE(float, 256, true), BENCHMARK_TYPE(int8_t, 256, true), BENCHMARK_TYPE(long long, 256, true), BENCHMARK_TYPE(double, 256, true)}); } benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "benchmark_block_adjacent_difference" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks("subtract_left", benchmarks, stream, size); add_benchmarks("subtract_right", benchmarks, stream, size); add_benchmarks("subtract_left_partial_tile", benchmarks, stream, 
size); add_benchmarks("subtract_right_partial_tile", benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; }hipCUB-rocm-6.4.3/benchmark/benchmark_block_discontinuity.cpp000066400000000000000000000260321502260333500243250ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/block/block_discontinuity.hpp" #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_store.hpp" #include "hipcub/thread/thread_operators.hpp" //to use hipcub::Equality #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif template struct custom_flag_op1 { HIPCUB_HOST_DEVICE bool operator()(const T& a, const T& b) const { return (a == b); } }; template __global__ __launch_bounds__(BlockSize) void kernel(const T* d_input, T* d_output) { Runner::template run(d_input, d_output); } struct flag_heads { template __device__ static void run(const T* d_input, T* d_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockDiscontinuity bdiscontinuity; bool head_flags[ItemsPerThread]; if(WithTile) { bdiscontinuity.FlagHeads(head_flags, input, hipcub::Equality(), T(123)); } else { bdiscontinuity.FlagHeads(head_flags, input, hipcub::Equality()); } for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] += head_flags[i]; } __syncthreads(); } hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } }; struct flag_tails { template __device__ static void run(const T* d_input, T* d_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockDiscontinuity bdiscontinuity; bool tail_flags[ItemsPerThread]; if(WithTile) { bdiscontinuity.FlagTails(tail_flags, input, hipcub::Equality(), T(123)); } else { 
bdiscontinuity.FlagTails(tail_flags, input, hipcub::Equality()); } for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] += tail_flags[i]; } __syncthreads(); } hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } }; struct flag_heads_and_tails { template __device__ static void run(const T* d_input, T* d_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockDiscontinuity bdiscontinuity; bool head_flags[ItemsPerThread]; bool tail_flags[ItemsPerThread]; if(WithTile) { bdiscontinuity.FlagHeadsAndTails(head_flags, T(123), tail_flags, T(234), input, hipcub::Equality()); } else { bdiscontinuity.FlagHeadsAndTails(head_flags, tail_flags, input, hipcub::Equality()); } for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] += head_flags[i]; input[i] += tail_flags[i]; } __syncthreads(); } hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } }; template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); T* d_input; T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ benchmark::RegisterBenchmark( \ std::string("block_discontinuity.sub_algorithm_name:" \ + name + ".") \ .c_str(), \ &run_benchmark, \ stream, \ size) #define BENCHMARK_TYPE(type, block, bool) \ CREATE_BENCHMARK(type, block, 1, bool), CREATE_BENCHMARK(type, block, 2, bool), \ CREATE_BENCHMARK(type, block, 3, bool), CREATE_BENCHMARK(type, block, 4, bool), \ CREATE_BENCHMARK(type, block, 8, bool) template void add_benchmarks(const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { BENCHMARK_TYPE(int, 256, false), BENCHMARK_TYPE(int, 256, true), BENCHMARK_TYPE(int8_t, 256, false), BENCHMARK_TYPE(int8_t, 256, true), BENCHMARK_TYPE(uint8_t, 256, false), BENCHMARK_TYPE(uint8_t, 256, true), BENCHMARK_TYPE(long long, 256, false), BENCHMARK_TYPE(long long, 256, true), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = 
parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_block_discontinuity" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks("flag_heads", benchmarks, stream, size); add_benchmarks("flag_tails", benchmarks, stream, size); add_benchmarks("flag_heads_and_tails", benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_block_exchange.cpp000066400000000000000000000334351502260333500232070ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/block/block_exchange.hpp" #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_store.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template __global__ __launch_bounds__(BlockSize) void kernel(const T* d_input, const unsigned int* d_ranks, T* d_output) { Runner::template run(d_input, d_ranks, d_output); } struct blocked_to_striped { template __device__ static void run(const T* d_input, const unsigned int*, T* d_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, d_input + block_offset, input); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; exchange.BlockedToStriped(input, input); __syncthreads(); // extra sync needed because of loop. In normal usage // sync with be cared for by the load and store functions // (outside the loop). 
} hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } }; struct striped_to_blocked { template __device__ static void run(const T* d_input, const unsigned int*, T* d_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; exchange.StripedToBlocked(input, input); __syncthreads(); // extra sync needed because of loop. In normal usage // sync with be cared for by the load and store functions // (outside the loop). } hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); } }; struct blocked_to_warp_striped { template __device__ static void run(const T* d_input, const unsigned int*, T* d_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, d_input + block_offset, input); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; exchange.BlockedToWarpStriped(input, input); __syncthreads(); // extra sync needed because of loop. In normal usage // sync with be cared for by the load and store functions // (outside the loop). } hipcub::StoreDirectWarpStriped(lid, d_output + block_offset, input); } }; struct warp_striped_to_blocked { template __device__ static void run(const T* d_input, const unsigned int*, T* d_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectWarpStriped(lid, d_input + block_offset, input); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; exchange.WarpStripedToBlocked(input, input); __syncthreads(); // extra sync needed because of loop. In normal usage // sync with be cared for by the load and store functions // (outside the loop). } hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); } }; struct scatter_to_blocked { template __device__ static void run(const T* d_input, const unsigned int* d_ranks, T* d_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); hipcub::LoadDirectStriped(lid, d_ranks + block_offset, ranks); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; exchange.ScatterToBlocked(input, input, ranks); __syncthreads(); // extra sync needed because of loop. In normal usage // sync with be cared for by the load and store functions // (outside the loop). 
} hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); } }; struct scatter_to_striped { template __device__ static void run(const T* d_input, const unsigned int* d_ranks, T* d_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); hipcub::LoadDirectStriped(lid, d_ranks + block_offset, ranks); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; exchange.ScatterToStriped(input, input, ranks); __syncthreads(); // extra sync needed because of loop. In normal usage // sync with be cared for by the load and store functions // (outside the loop). } hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } }; template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input(size); // Fill input for(size_t i = 0; i < size; i++) { input[i] = T(i); } std::vector ranks(size); // Fill ranks (for scatter operations) std::mt19937 gen; for(size_t bi = 0; bi < size / items_per_block; bi++) { auto block_ranks = ranks.begin() + bi * items_per_block; std::iota(block_ranks, block_ranks + items_per_block, 0); std::shuffle(block_ranks, block_ranks + items_per_block, gen); } T* d_input; unsigned int* d_ranks; T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_ranks, size * sizeof(unsigned int))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(d_ranks, ranks.data(), size * sizeof(unsigned int), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_ranks, d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_ranks)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark(std::string("block_exchange.sub_algorithm_name:" \ + name) \ .c_str(), \ &run_benchmark, \ stream, \ size) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 7), CREATE_BENCHMARK(type, block, 8) template void add_benchmarks(const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; std::vector bs = { BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int8_t, 256), BENCHMARK_TYPE(long long, 256), BENCHMARK_TYPE(custom_float2, 256), BENCHMARK_TYPE(custom_double2, 256), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char* argv[]) { cli::Parser 
parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_block_exchange" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks("blocked_to_striped", benchmarks, stream, size); add_benchmarks("striped_to_blocked", benchmarks, stream, size); add_benchmarks("blocked_to_warp_striped", benchmarks, stream, size); add_benchmarks("warp_striped_to_blocked", benchmarks, stream, size); add_benchmarks("scatter_to_blocked", benchmarks, stream, size); add_benchmarks("scatter_to_striped", benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_block_histogram.cpp000066400000000000000000000202721502260333500234150ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
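// This benchmark exercises hipcub::BlockHistogram at block scope: each thread
// block computes a BinSize-bin histogram in shared memory from ItemsPerThread
// values per thread; both the BLOCK_HISTO_ATOMIC and BLOCK_HISTO_SORT variants
// are registered in main() below.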
#include "common_benchmark_header.hpp" // HIP API #include "hipcub/block/block_histogram.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif template __global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) { Runner::template run(input, output); } template struct histogram { template __device__ static void run(const T* input, T* output) { const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; unsigned int global_offset = hipBlockIdx_x * BinSize; T values[ItemsPerThread]; for(unsigned int k = 0; k < ItemsPerThread; k++) { values[k] = input[index + k]; } using bhistogram_t = hipcub::BlockHistogram; __shared__ T histogram[BinSize]; __shared__ typename bhistogram_t::TempStorage storage; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { bhistogram_t(storage).Histogram(values, histogram); } #pragma unroll for(unsigned int offset = 0; offset < BinSize; offset += BlockSize) { if(offset + hipThreadIdx_x < BinSize) { output[global_offset + hipThreadIdx_x] = histogram[offset + hipThreadIdx_x]; global_offset += BlockSize; } } } }; template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); const auto bin_size = BinSize * ((N + items_per_block - 1) / items_per_block); // Allocate and fill memory std::vector input(size, 0.0f); T* d_input; T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, bin_size * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); state.SetItemsProcessed(state.iterations() * size * Trials); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } // IPT - items per thread #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark(std::string("block_histogram.method_name:" + method_name) \ .c_str(), \ &run_benchmark, \ stream, \ size) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 8), CREATE_BENCHMARK(type, block, 16) template void add_benchmarks(std::vector& benchmarks, const std::string& method_name, const std::string& algorithm_name, hipStream_t stream, size_t size) { std::vector new_benchmarks = {BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int, 320), BENCHMARK_TYPE(int, 512), BENCHMARK_TYPE(unsigned long long, 256), BENCHMARK_TYPE(unsigned long long, 320)}; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // 
Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_block_histogram" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; // using_atomic using histogram_a_t = histogram; add_benchmarks(benchmarks, "histogram", "using_atomic", stream, size); // using_sort using histogram_s_t = histogram; add_benchmarks(benchmarks, "histogram", "using_sort", stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_block_merge_sort.cpp000066400000000000000000000236471502260333500235770ustar00rootroot00000000000000// MIT License // // Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
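// This benchmark exercises hipcub::BlockMergeSort at block scope for both
// keys-only and key-value sorting (benchmark_kinds::sort_keys / sort_pairs):
// each block loads a striped tile, sorts it repeatedly with
// hipcub::BlockMergeSort, and stores the result back.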
#include "common_benchmark_header.hpp" #include "../test/hipcub/test_utils_sort_comparator.hpp" // HIP API #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_merge_sort.hpp" #include "hipcub/block/block_store.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif enum class benchmark_kinds { sort_keys, sort_pairs }; template __global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T* input, T* output, CompareOp compare_op) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; hipcub::LoadDirectStriped(lid, input + block_offset, keys); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockMergeSort sort; sort.Sort(keys, compare_op); } hipcub::StoreDirectStriped(lid, output + block_offset, keys); } template __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T* input, T* output, CompareOp compare_op) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; T values[ItemsPerThread]; hipcub::LoadDirectStriped(lid, input + block_offset, keys); for(unsigned int i = 0; i < ItemsPerThread; i++) { values[i] = keys[i] + T(1); } #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockMergeSort sort; sort.Sort(keys, values, compare_op); } for(unsigned int i = 0; i < ItemsPerThread; i++) { keys[i] += values[i]; } hipcub::StoreDirectStriped(lid, output + block_offset, keys); } template void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipStream_t stream, size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input = benchmark_utils::get_random_data(size, benchmark_utils::generate_limits::min(), benchmark_utils::generate_limits::max()); T* d_input; T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); if(benchmark_kind == benchmark_kinds::sort_keys) { hipLaunchKernelGGL( HIP_KERNEL_NAME(sort_keys_kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output, CompareOp()); } else if(benchmark_kind == benchmark_kinds::sort_pairs) { hipLaunchKernelGGL( HIP_KERNEL_NAME(sort_pairs_kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output, CompareOp()); } HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark(std::string("block_merge_sort.sub_algorithm_name:" \ + name) \ .c_str(), \ &run_benchmark, \ benchmark_kind, \ stream, \ size) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, 
block, 8) void add_benchmarks(benchmark_kinds benchmark_kind, const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = {BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(int, 128), BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int, 512), BENCHMARK_TYPE(int8_t, 64), BENCHMARK_TYPE(int8_t, 128), BENCHMARK_TYPE(int8_t, 256), BENCHMARK_TYPE(int8_t, 512), BENCHMARK_TYPE(uint8_t, 64), BENCHMARK_TYPE(uint8_t, 128), BENCHMARK_TYPE(uint8_t, 256), BENCHMARK_TYPE(uint8_t, 512), BENCHMARK_TYPE(long long, 64), BENCHMARK_TYPE(long long, 128), BENCHMARK_TYPE(long long, 256), BENCHMARK_TYPE(long long, 512)}; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_block_merge_sort" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks(benchmark_kinds::sort_keys, "sort(keys)", benchmarks, stream, size); add_benchmarks(benchmark_kinds::sort_pairs, "sort(keys, values)", benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_block_radix_rank.cpp000066400000000000000000000216011502260333500235370ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
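// -----------------------------------------------------------------------------
// Illustrative sketch (not part of the benchmark build, hence #if 0): the
// manual timing pattern shared by the benchmarks in this directory. GPU work is
// timed on the host with std::chrono around a device synchronization, the
// measured seconds are reported through state.SetIterationTime(), and the
// registered benchmark is switched to UseManualTime() so Google Benchmark uses
// those numbers instead of its own wall-clock measurement. Names below are
// hypothetical.
// -----------------------------------------------------------------------------
#if 0
#include "benchmark/benchmark.h"

#include <hip/hip_runtime.h>

#include <chrono>

static void example_manual_time(benchmark::State& state)
{
    for(auto _ : state)
    {
        const auto start = std::chrono::high_resolution_clock::now();
        // ... enqueue the kernel(s) being measured here ...
        (void)hipDeviceSynchronize();
        const auto end = std::chrono::high_resolution_clock::now();
        state.SetIterationTime(std::chrono::duration<double>(end - start).count());
    }
}

int main(int argc, char* argv[])
{
    benchmark::Initialize(&argc, argv);
    auto* b = benchmark::RegisterBenchmark("example.manual_time", example_manual_time);
    b->UseManualTime();
    b->Unit(benchmark::kMillisecond);
    benchmark::RunSpecifiedBenchmarks();
    return 0;
}
#endif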
#include "common_benchmark_header.hpp" // HIP API #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_radix_rank.hpp" #include "hipcub/block/block_store.hpp" #include "hipcub/block/radix_rank_sort_operations.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif enum class RadixRankAlgorithm { RADIX_RANK_BASIC, RADIX_RANK_MEMOIZE, RADIX_RANK_MATCH, }; template __global__ __launch_bounds__(BlockSize) void rank_kernel(const T* keys_input, int* ranks_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, keys_input + block_offset, keys); using KeyTraits = hipcub::Traits; using UnsignedBits = typename KeyTraits::UnsignedBits; using DigitExtractor = hipcub::BFEDigitExtractor; UnsignedBits(&unsigned_keys)[ItemsPerThread] = reinterpret_cast(keys); using RankType = std::conditional_t< BenchmarkKind == RadixRankAlgorithm::RADIX_RANK_MATCH, hipcub::BlockRadixRankMatch, hipcub::BlockRadixRank>; #pragma unroll for(unsigned int key = 0; key < ItemsPerThread; key++) { unsigned_keys[key] = KeyTraits::TwiddleIn(unsigned_keys[key]); } int ranks[ItemsPerThread]; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { __shared__ typename RankType::TempStorage storage; RankType rank(storage); unsigned begin_bit = 0; const unsigned end_bit = sizeof(T) * 8; while(begin_bit < end_bit) { const unsigned pass_bits = min(RadixBits, end_bit - begin_bit); DigitExtractor digit_extractor(begin_bit, pass_bits); rank.RankKeys(unsigned_keys, ranks, digit_extractor); begin_bit += RadixBits; } } hipcub::StoreDirectBlocked(lid, ranks_output + block_offset, ranks); } template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input = benchmark_utils::get_random_data(size, benchmark_utils::generate_limits::min(), benchmark_utils::generate_limits::max()); T* d_input; int* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(int))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL( HIP_KERNEL_NAME( rank_kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, KIND, BS, IPT) \ benchmark::RegisterBenchmark(std::string("block_radix_rank." 
\ + name) \ .c_str(), \ &run_benchmark, \ stream, \ size) // clang-format off #define CREATE_BENCHMARK_KINDS(type, block, ipt) \ CREATE_BENCHMARK(type, RadixRankAlgorithm::RADIX_RANK_BASIC, block, ipt), \ CREATE_BENCHMARK(type, RadixRankAlgorithm::RADIX_RANK_MEMOIZE, block, ipt), \ CREATE_BENCHMARK(type, RadixRankAlgorithm::RADIX_RANK_MATCH, block, ipt) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK_KINDS(type, block, 1), \ CREATE_BENCHMARK_KINDS(type, block, 4), \ CREATE_BENCHMARK_KINDS(type, block, 8), \ CREATE_BENCHMARK_KINDS(type, block, 16), \ CREATE_BENCHMARK_KINDS(type, block, 32) // clang-format on void add_benchmarks(const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { BENCHMARK_TYPE(int, 128), BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int, 512), BENCHMARK_TYPE(uint8_t, 128), BENCHMARK_TYPE(uint8_t, 256), BENCHMARK_TYPE(uint8_t, 512), BENCHMARK_TYPE(long long, 128), BENCHMARK_TYPE(long long, 256), BENCHMARK_TYPE(long long, 512), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "benchmark_block_radix_rank" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks("rank", benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_block_radix_sort.cpp000066400000000000000000000316371502260333500236050ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
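// -----------------------------------------------------------------------------
// Illustrative sketch (not part of the benchmark build, hence #if 0): the
// hipcub::BlockRadixSort usage this benchmark times. The helper structs in this
// file choose between Sort(), which leaves a blocked arrangement, and
// SortBlockedToStriped(), which leaves a striped arrangement; this sketch shows
// the striped variant. The kernel name is hypothetical.
// -----------------------------------------------------------------------------
#if 0
#include <hipcub/hipcub.hpp>

template<class T, unsigned int BlockSize, unsigned int ItemsPerThread>
__global__ void example_block_radix_sort(const T* input, T* output)
{
    const unsigned int lid          = threadIdx.x;
    const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;

    // Load a blocked tile of keys.
    T keys[ItemsPerThread];
    hipcub::LoadDirectBlocked(lid, input + block_offset, keys);

    using block_radix_sort_t = hipcub::BlockRadixSort<T, BlockSize, ItemsPerThread>;
    __shared__ typename block_radix_sort_t::TempStorage storage;

    // Sort and convert to a striped arrangement in one step.
    block_radix_sort_t(storage).SortBlockedToStriped(keys);

    hipcub::StoreDirectStriped<BlockSize>(lid, output + block_offset, keys);
}
#endif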
#include "common_benchmark_header.hpp" // HIP API #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_radix_sort.hpp" #include "hipcub/block/block_store.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif enum class benchmark_kinds { sort_keys, sort_pairs }; struct helper_blocked_blocked { template HIPCUB_DEVICE static void load(int linear_id, InputIteratorT block_iter, T (&items)[ItemsPerThread]) { hipcub::LoadDirectStriped(linear_id, block_iter, items); } template HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread]) { hipcub::BlockRadixSort sort; sort.Sort(keys); } template HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread], T (&values)[ItemsPerThread]) { hipcub::BlockRadixSort sort; sort.Sort(keys, values); } template HIPCUB_DEVICE static void sort(benchmark_utils::custom_type (&keys)[ItemsPerThread]) { using custom_t = benchmark_utils::custom_type; hipcub::BlockRadixSort sort; sort.Sort(keys, benchmark_utils::custom_type_decomposer{}); } template HIPCUB_DEVICE static void sort(benchmark_utils::custom_type (&keys)[ItemsPerThread], benchmark_utils::custom_type (&values)[ItemsPerThread]) { using custom_t = benchmark_utils::custom_type; hipcub::BlockRadixSort sort; sort.Sort(keys, values, benchmark_utils::custom_type_decomposer{}); } }; struct helper_blocked_striped { template HIPCUB_DEVICE static void load(int linear_id, InputIteratorT block_iter, T (&items)[ItemsPerThread]) { hipcub::LoadDirectBlocked(linear_id, block_iter, items); } template HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread]) { hipcub::BlockRadixSort sort; sort.SortBlockedToStriped(keys); } template HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread], T (&values)[ItemsPerThread]) { hipcub::BlockRadixSort sort; sort.SortBlockedToStriped(keys, values); } template HIPCUB_DEVICE static void sort(benchmark_utils::custom_type (&keys)[ItemsPerThread]) { using custom_t = benchmark_utils::custom_type; hipcub::BlockRadixSort sort; sort.SortBlockedToStriped(keys, benchmark_utils::custom_type_decomposer{}); } template HIPCUB_DEVICE static void sort(benchmark_utils::custom_type (&keys)[ItemsPerThread], benchmark_utils::custom_type (&values)[ItemsPerThread]) { using custom_t = benchmark_utils::custom_type; hipcub::BlockRadixSort sort; sort.SortBlockedToStriped(keys, values, benchmark_utils::custom_type_decomposer{}); } }; template __global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T* input, T* output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; Helper::template load(lid, input + block_offset, keys); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { Helper::template sort(keys); } hipcub::StoreDirectStriped(lid, output + block_offset, keys); } template __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T* input, T* output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; T values[ItemsPerThread]; Helper::template load(lid, input + block_offset, keys); for(unsigned int i = 0; i < ItemsPerThread; i++) { values[i] = keys[i] + T(1); } #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { Helper::template sort(keys, values); } for(unsigned int i = 0; i < ItemsPerThread; i++) { keys[i] += values[i]; } hipcub::StoreDirectStriped(lid, output + block_offset, keys); } template void run_benchmark(benchmark::State& state, 
benchmark_kinds benchmark_kind, hipStream_t stream, size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input = benchmark_utils::get_random_data(size, benchmark_utils::generate_limits::min(), benchmark_utils::generate_limits::max()); T* d_input; T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); if(benchmark_kind == benchmark_kinds::sort_keys) { sort_keys_kernel <<>>(d_input, d_output); } else if(benchmark_kind == benchmark_kinds::sort_pairs) { sort_pairs_kernel <<>>(d_input, d_output); } HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark(std::string("block_radix_sort.sub_algorithm_name:" \ + name) \ .c_str(), \ &run_benchmark, \ benchmark_kind, \ stream, \ size) // clang-format off #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), \ CREATE_BENCHMARK(type, block, 3), \ CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 8) // clang-format on template void add_benchmarks(benchmark_kinds benchmark_kind, const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_int_t = benchmark_utils::custom_type; std::vector bs = { BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(int, 128), BENCHMARK_TYPE(int, 192), BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int, 320), BENCHMARK_TYPE(int, 512), BENCHMARK_TYPE(int8_t, 64), BENCHMARK_TYPE(int8_t, 128), BENCHMARK_TYPE(int8_t, 192), BENCHMARK_TYPE(int8_t, 256), BENCHMARK_TYPE(int8_t, 320), BENCHMARK_TYPE(int8_t, 512), BENCHMARK_TYPE(long long, 64), BENCHMARK_TYPE(long long, 128), BENCHMARK_TYPE(long long, 192), BENCHMARK_TYPE(long long, 256), BENCHMARK_TYPE(long long, 320), BENCHMARK_TYPE(long long, 512), BENCHMARK_TYPE(custom_int_t, 64), BENCHMARK_TYPE(custom_int_t, 128), BENCHMARK_TYPE(custom_int_t, 192), BENCHMARK_TYPE(custom_int_t, 256), BENCHMARK_TYPE(custom_int_t, 320), BENCHMARK_TYPE(custom_int_t, 512), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_block_radix_sort" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; // clang-format off add_benchmarks( benchmark_kinds::sort_keys, "sort(keys)", benchmarks, stream, size); 
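    // The remaining registrations cover pairs sorting and the "sort_to_striped"
    // variants of both; whether the kernels call Sort() or SortBlockedToStriped()
    // is selected through the helper type (helper_blocked_blocked /
    // helper_blocked_striped) passed to add_benchmarks.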
add_benchmarks( benchmark_kinds::sort_pairs, "sort(keys, values)", benchmarks, stream, size); add_benchmarks( benchmark_kinds::sort_keys, "sort_to_striped(keys)", benchmarks, stream, size); add_benchmarks( benchmark_kinds::sort_pairs, "sort_to_striped(keys, values)", benchmarks, stream, size); // clang-format on // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_block_reduce.cpp000066400000000000000000000207751502260333500226770ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
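// -----------------------------------------------------------------------------
// Illustrative sketch (not part of the benchmark build, hence #if 0): the
// hipcub::BlockReduce pattern this benchmark times. Each thread contributes
// ItemsPerThread values and only thread 0 of the block receives the valid
// aggregate, which it writes out per block. The kernel name is hypothetical and
// the default algorithm is used here, while the benchmark below instantiates the
// WARP_REDUCTIONS, RAKING and RAKING_COMMUTATIVE_ONLY variants explicitly.
// -----------------------------------------------------------------------------
#if 0
#include <hipcub/hipcub.hpp>

template<class T, unsigned int BlockSize, unsigned int ItemsPerThread>
__global__ void example_block_reduce(const T* input, T* output)
{
    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

    T values[ItemsPerThread];
    for(unsigned int k = 0; k < ItemsPerThread; k++)
    {
        values[k] = input[tid * ItemsPerThread + k];
    }

    using block_reduce_t = hipcub::BlockReduce<T, BlockSize>;
    __shared__ typename block_reduce_t::TempStorage storage;

    // Block-wide sum; only thread 0 holds the valid result.
    const T aggregate = block_reduce_t(storage).Reduce(values, hipcub::Sum());
    if(threadIdx.x == 0)
    {
        output[blockIdx.x] = aggregate;
    }
}
#endif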
#include "common_benchmark_header.hpp" // HIP API #include "hipcub/block/block_reduce.hpp" #include "hipcub/thread/thread_operators.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template __global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) { Runner::template run(input, output); } template struct reduce { template __device__ static void run(const T* input, T* output) { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; T values[ItemsPerThread]; T reduced_value; for(unsigned int k = 0; k < ItemsPerThread; k++) { values[k] = input[i * ItemsPerThread + k]; } using breduce_t = hipcub::BlockReduce; __shared__ typename breduce_t::TempStorage storage; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { reduced_value = breduce_t(storage).Reduce(values, hipcub::Sum()); values[0] = reduced_value; } if(hipThreadIdx_x == 0) { output[hipBlockIdx_x] = reduced_value; } } }; template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); // Allocate and fill memory std::vector input(size, T(1)); T* d_input; T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); state.SetItemsProcessed(state.iterations() * size * Trials); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } // IPT - items per thread #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark(std::string("block_reduce.method_name:" + method_name) \ .c_str(), \ &run_benchmark, \ stream, \ size) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 8), CREATE_BENCHMARK(type, block, 11), \ CREATE_BENCHMARK(type, block, 16) template void add_benchmarks(std::vector& benchmarks, const std::string& method_name, const std::string& algorithm_name, hipStream_t stream, size_t size) { std::vector new_benchmarks = { // When block size is less than or equal to warp size BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(float, 64), BENCHMARK_TYPE(double, 64), BENCHMARK_TYPE(int8_t, 64), BENCHMARK_TYPE(uint8_t, 64), BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(float, 256), BENCHMARK_TYPE(double, 256), BENCHMARK_TYPE(int8_t, 256), BENCHMARK_TYPE(uint8_t, 256), }; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); 
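    // A non-positive trial count (the default of -1) leaves the iteration count
    // up to Google Benchmark; a positive value forces it via b->Iterations(trials)
    // further below.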
const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_block_reduce" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; // using_warp_scan using reduce_uwr_t = reduce; add_benchmarks(benchmarks, "reduce", "BLOCK_REDUCE_WARP_REDUCTIONS", stream, size); // raking reduce using reduce_rr_t = reduce; add_benchmarks(benchmarks, "reduce", "BLOCK_REDUCE_RAKING", stream, size); // raking reduce commutative only using reduce_rrco_t = reduce; add_benchmarks(benchmarks, "reduce", "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY", stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_block_run_length_decode.cpp000066400000000000000000000232461502260333500250740ustar00rootroot00000000000000// MIT License // // Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
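// -----------------------------------------------------------------------------
// Illustrative sketch (not part of the benchmark build, hence #if 0): a minimal
// hipcub::BlockRunLengthDecode example using the constructor that takes run
// values plus per-run lengths. The benchmark in this file uses the run-offsets
// constructor instead, but the windowed decode loop is the same. The sketch
// assumes a single-block launch and an output buffer padded to a full decode
// window; kernel and variable names are hypothetical.
// -----------------------------------------------------------------------------
#if 0
#include <hipcub/hipcub.hpp>

template<unsigned int BlockSize, unsigned int RunsPerThread, unsigned int DecodedItemsPerThread>
__global__ void example_run_length_decode(const int*      d_run_values,
                                          const unsigned* d_run_lengths,
                                          int*            d_decoded)
{
    using block_rld_t
        = hipcub::BlockRunLengthDecode<int, BlockSize, RunsPerThread, DecodedItemsPerThread>;
    __shared__ typename block_rld_t::TempStorage storage;

    // Each thread loads RunsPerThread (value, length) pairs.
    int      run_values[RunsPerThread];
    unsigned run_lengths[RunsPerThread];
    hipcub::LoadDirectBlocked(threadIdx.x, d_run_values, run_values);
    hipcub::LoadDirectBlocked(threadIdx.x, d_run_lengths, run_lengths);

    // The constructor computes the total decoded size of the block's runs.
    unsigned    total_decoded_size = 0;
    block_rld_t block_rld(storage, run_values, run_lengths, total_decoded_size);

    // Decode in windows of BlockSize * DecodedItemsPerThread items.
    unsigned decoded_window_offset = 0;
    while(decoded_window_offset < total_decoded_size)
    {
        int decoded_items[DecodedItemsPerThread];
        block_rld.RunLengthDecode(decoded_items, decoded_window_offset);
        hipcub::StoreDirectBlocked(threadIdx.x, d_decoded + decoded_window_offset, decoded_items);
        decoded_window_offset += BlockSize * DecodedItemsPerThread;
    }
}
#endif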
#include "common_benchmark_header.hpp" #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_run_length_decode.hpp" #include "hipcub/block/block_store.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template __global__ __launch_bounds__(BlockSize) void block_run_length_decode_kernel(const ItemT* d_run_items, const OffsetT* d_run_offsets, ItemT* d_decoded_items, bool enable_store = false) { using BlockRunLengthDecodeT = hipcub::BlockRunLengthDecode; ItemT run_items[RunsPerThread]; OffsetT run_offsets[RunsPerThread]; const unsigned global_thread_idx = BlockSize * hipBlockIdx_x + hipThreadIdx_x; hipcub::LoadDirectBlocked(global_thread_idx, d_run_items, run_items); hipcub::LoadDirectBlocked(global_thread_idx, d_run_offsets, run_offsets); BlockRunLengthDecodeT block_run_length_decode(run_items, run_offsets); const OffsetT total_decoded_size = d_run_offsets[(hipBlockIdx_x + 1) * BlockSize * RunsPerThread] - d_run_offsets[hipBlockIdx_x * BlockSize * RunsPerThread]; #pragma nounroll for(unsigned i = 0; i < Trials; ++i) { OffsetT decoded_window_offset = 0; while(decoded_window_offset < total_decoded_size) { ItemT decoded_items[DecodedItemsPerThread]; block_run_length_decode.RunLengthDecode(decoded_items, decoded_window_offset); if(enable_store) { hipcub::StoreDirectBlocked(global_thread_idx, d_decoded_items + decoded_window_offset, decoded_items); } decoded_window_offset += BlockSize * DecodedItemsPerThread; } } } template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr auto runs_per_block = BlockSize * RunsPerThread; const auto target_num_runs = 2 * N / (MinRunLength + MaxRunLength); const auto num_runs = runs_per_block * ((target_num_runs + runs_per_block - 1) / runs_per_block); std::vector run_items(num_runs); std::vector run_offsets(num_runs + 1); std::default_random_engine prng(std::random_device{}()); using ItemDistribution = std::conditional_t::value, std::uniform_int_distribution, std::uniform_real_distribution>; ItemDistribution run_item_dist(0, 100); std::uniform_int_distribution run_length_dist(MinRunLength, MaxRunLength); for(size_t i = 0; i < num_runs; ++i) { run_items[i] = run_item_dist(prng); } for(size_t i = 1; i < num_runs + 1; ++i) { const OffsetT next_run_length = run_length_dist(prng); run_offsets[i] = run_offsets[i - 1] + next_run_length; } const OffsetT output_length = run_offsets.back(); ItemT* d_run_items{}; HIP_CHECK(hipMalloc(&d_run_items, run_items.size() * sizeof(ItemT))); HIP_CHECK(hipMemcpy(d_run_items, run_items.data(), run_items.size() * sizeof(ItemT), hipMemcpyHostToDevice)); OffsetT* d_run_offsets{}; HIP_CHECK(hipMalloc(&d_run_offsets, run_offsets.size() * sizeof(OffsetT))); HIP_CHECK(hipMemcpy(d_run_offsets, run_offsets.data(), run_offsets.size() * sizeof(OffsetT), hipMemcpyHostToDevice)); ItemT* d_output{}; HIP_CHECK(hipMalloc(&d_output, output_length * sizeof(ItemT))); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL(HIP_KERNEL_NAME(block_run_length_decode_kernel), dim3(num_runs / runs_per_block), dim3(BlockSize), 0, stream, d_run_items, d_run_offsets, d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * output_length * sizeof(ItemT) * Trials); state.SetItemsProcessed(state.iterations() * output_length 
* Trials); HIP_CHECK(hipFree(d_run_items)); HIP_CHECK(hipFree(d_run_offsets)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT) \ benchmark::RegisterBenchmark( \ std::string("block_run_length_decode.") \ .c_str(), \ &run_benchmark, \ stream, \ size) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_block_run_length_decode" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks{ CREATE_BENCHMARK(int, int, 1, 5, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 10, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 50, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 100, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 500, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 1000, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 5000, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 5, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 10, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 50, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 100, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 500, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 1000, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 5000, 128, 2, 4)}; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_block_scan.cpp000066400000000000000000000235421502260333500223470ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
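// -----------------------------------------------------------------------------
// Illustrative sketch (not part of the benchmark build, hence #if 0): the
// hipcub::BlockScan pattern this benchmark times, shown as an in-place inclusive
// prefix sum with the default algorithm. The benchmark below instantiates the
// RAKING, RAKING_MEMOIZE and WARP_SCANS variants through the algorithm template
// parameter, and also exercises ExclusiveScan with an initial value. The kernel
// name is hypothetical.
// -----------------------------------------------------------------------------
#if 0
#include <hipcub/hipcub.hpp>

template<class T, unsigned int BlockSize, unsigned int ItemsPerThread>
__global__ void example_block_scan(const T* input, T* output)
{
    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

    T values[ItemsPerThread];
    for(unsigned int k = 0; k < ItemsPerThread; k++)
    {
        values[k] = input[tid * ItemsPerThread + k];
    }

    using block_scan_t = hipcub::BlockScan<T, BlockSize>;
    __shared__ typename block_scan_t::TempStorage storage;

    // In-place inclusive prefix sum over the block's tile.
    block_scan_t(storage).InclusiveScan(values, values, hipcub::Sum());

    for(unsigned int k = 0; k < ItemsPerThread; k++)
    {
        output[tid * ItemsPerThread + k] = values[k];
    }
}
#endif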
#include "common_benchmark_header.hpp" // hipCUB API #include "hipcub/block/block_scan.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template __global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output, const T init) { Runner::template run(input, output, init); } template struct inclusive_scan { template __device__ static void run(const T* input, T* output, const T init) { (void)init; const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; T values[ItemsPerThread]; for(unsigned int k = 0; k < ItemsPerThread; k++) { values[k] = input[i * ItemsPerThread + k]; } using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage storage; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { bscan_t(storage).InclusiveScan(values, values, hipcub::Sum()); } for(unsigned int k = 0; k < ItemsPerThread; k++) { output[i * ItemsPerThread + k] = values[k]; } } }; template struct exclusive_scan { template __device__ static void run(const T* input, T* output, const T init) { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; T values[ItemsPerThread]; for(unsigned int k = 0; k < ItemsPerThread; k++) { values[k] = input[i * ItemsPerThread + k]; } using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage storage; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { bscan_t(storage).ExclusiveScan(values, values, init, hipcub::Sum()); } for(unsigned int k = 0; k < ItemsPerThread; k++) { output[i * ItemsPerThread + k] = values[k]; } } }; template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); // Allocate and fill memory std::vector input(size, T(1)); T* d_input; T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output, input[0]); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); state.SetItemsProcessed(state.iterations() * size * Trials); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } // IPT - items per thread #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark(std::string("block_scan.method_name:" + method_name) \ .c_str(), \ &run_benchmark, \ stream, \ size) // clang-format off #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), \ CREATE_BENCHMARK(type, block, 3), \ CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 8), \ CREATE_BENCHMARK(type, block, 11), \ CREATE_BENCHMARK(type, block, 16) // clang-format on template void add_benchmarks(std::vector& benchmarks, const std::string& method_name, const std::string& algorithm_name, hipStream_t stream, size_t size) { using custom_float2 = benchmark_utils::custom_type; using custom_double2 = 
benchmark_utils::custom_type; std::vector new_benchmarks = { // When block size is less than or equal to warp size BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(float, 64), BENCHMARK_TYPE(double, 64), BENCHMARK_TYPE(uint8_t, 64), BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(float, 256), BENCHMARK_TYPE(double, 256), BENCHMARK_TYPE(uint8_t, 256), CREATE_BENCHMARK(custom_float2, 256, 1), CREATE_BENCHMARK(custom_float2, 256, 4), CREATE_BENCHMARK(custom_float2, 256, 8), CREATE_BENCHMARK(custom_double2, 256, 1), CREATE_BENCHMARK(custom_double2, 256, 4), CREATE_BENCHMARK(custom_double2, 256, 8), }; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_block_scan" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; // clang-format off add_benchmarks>( benchmarks, "inclusive_scan", "BLOCK_SCAN_RAKING", stream, size); add_benchmarks>( benchmarks, "inclusive_scan", "BLOCK_SCAN_RAKING_MEMOIZE", stream, size); add_benchmarks>( benchmarks, "inclusive_scan", "BLOCK_SCAN_WARP_SCANS", stream, size); add_benchmarks>( benchmarks, "exclusive_scan", "BLOCK_SCAN_RAKING", stream, size); add_benchmarks>( benchmarks, "exclusive_scan", "BLOCK_SCAN_RAKING_MEMOIZE", stream, size); add_benchmarks>( benchmarks, "exclusive_scan", "BLOCK_SCAN_WARP_SCANS", stream, size); // clang-format on // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_block_shuffle.cpp000066400000000000000000000274441502260333500230640ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
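// -----------------------------------------------------------------------------
// Illustrative sketch (not part of the benchmark build, hence #if 0): the
// hipcub::BlockShuffle::Offset() pattern this benchmark times; Rotate(), Up()
// and Down() follow the same TempStorage usage. Offset() moves each thread's
// value across the block by the given lane distance. The kernel name is
// hypothetical.
// -----------------------------------------------------------------------------
#if 0
#include <hipcub/hipcub.hpp>

template<class T, unsigned int BlockSize>
__global__ void example_block_shuffle(const T* input, T* output)
{
    const unsigned int tid = blockIdx.x * BlockSize + threadIdx.x;

    using block_shuffle_t = hipcub::BlockShuffle<T, BlockSize>;
    __shared__ typename block_shuffle_t::TempStorage storage;

    // Shift values across the block by one lane.
    T value = input[tid];
    block_shuffle_t(storage).Offset(value, value, 1);

    output[tid] = value;
}
#endif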
#include "common_benchmark_header.hpp" #include "hipcub/block/block_shuffle.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template __global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) { Runner::template run(input, output); } struct offset { template __device__ static void run(const T* input, T* output) { const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; T value = input[tid]; using bshuffle_t = hipcub::BlockShuffle; __shared__ typename bshuffle_t::TempStorage storage; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { bshuffle_t(storage).Offset(value, value, 1); // sync is required because of loop since // temporary storage is accessed next iteration __syncthreads(); } output[tid] = value; } static constexpr bool uses_ipt = false; }; struct rotate { template __device__ static void run(const T* input, T* output) { const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; T value = input[tid]; using bshuffle_t = hipcub::BlockShuffle; __shared__ typename bshuffle_t::TempStorage storage; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { bshuffle_t(storage).Rotate(value, value, 1); // sync is required because of loop since // temporary storage is accessed next iteration __syncthreads(); } output[tid] = value; } static constexpr bool uses_ipt = false; }; struct up { template __device__ static void run(const T* input, T* output) { const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; T values[ItemsPerThread]; for(unsigned int i = 0; i < ItemsPerThread; i++) { values[i] = input[ItemsPerThread * tid + i]; } using bshuffle_t = hipcub::BlockShuffle; __shared__ typename bshuffle_t::TempStorage storage; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { bshuffle_t(storage).Up(values, values); // sync is required because of loop since // temporary storage is accessed next iteration __syncthreads(); } for(unsigned int i = 0; i < ItemsPerThread; i++) { output[ItemsPerThread * tid + i] = values[i]; } } static constexpr bool uses_ipt = true; }; struct down { template __device__ static void run(const T* input, T* output) { const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; T values[ItemsPerThread]; for(unsigned int i = 0; i < ItemsPerThread; i++) { values[i] = input[ItemsPerThread * tid + i]; } using bshuffle_t = hipcub::BlockShuffle; __shared__ typename bshuffle_t::TempStorage storage; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { bshuffle_t(storage).Down(values, values); // sync is required because of loop since // temporary storage is accessed next iteration __syncthreads(); } for(unsigned int i = 0; i < ItemsPerThread; i++) { output[ItemsPerThread * tid + i] = values[i]; } } static constexpr bool uses_ipt = true; }; template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input(size, T(1)); T* d_input; T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, 
d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK_IPT(BS, IPT) \ benchmark::RegisterBenchmark( \ ("block_shuffle.sub_algorithm_name:" + name) \ .c_str(), \ &run_benchmark, \ stream, \ size) #define CREATE_BENCHMARK(BS) \ benchmark::RegisterBenchmark(("block_shuffle.sub_algorithm_name:" + name) \ .c_str(), \ &run_benchmark, \ stream, \ size) template = true> void add_benchmarks_type(const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size, const std::string& type_name) { std::vector bs = { CREATE_BENCHMARK_IPT(256, 1), CREATE_BENCHMARK_IPT(256, 3), CREATE_BENCHMARK_IPT(256, 4), CREATE_BENCHMARK_IPT(256, 8), CREATE_BENCHMARK_IPT(256, 16), CREATE_BENCHMARK_IPT(256, 32), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } template = true> void add_benchmarks_type(const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size, const std::string& type_name) { std::vector bs = { CREATE_BENCHMARK(256), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } #define CREATE_BENCHMARKS(T) add_benchmarks_type(name, benchmarks, stream, size, #T) template void add_benchmarks(const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; CREATE_BENCHMARKS(int); CREATE_BENCHMARKS(float); CREATE_BENCHMARKS(double); CREATE_BENCHMARKS(int8_t); CREATE_BENCHMARKS(long long); CREATE_BENCHMARKS(custom_float2); CREATE_BENCHMARKS(custom_double2); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_block_shuffle" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks("offset", benchmarks, stream, size); add_benchmarks("rotate", benchmarks, stream, size); add_benchmarks("up", benchmarks, stream, size); add_benchmarks("down", benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_device_adjacent_difference.cpp000066400000000000000000000241411502260333500255070ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. // CUB's implementation of DeviceRunLengthEncode has unused parameters, // disable the warning because all warnings are threated as errors: #include "common_benchmark_header.hpp" #include #include "cmdparser.hpp" #include #include #include #include #include #include #include namespace { #ifndef DEFAULT_N constexpr std::size_t DEFAULT_N = 1024 * 1024 * 128; #endif constexpr unsigned int batch_size = 10; constexpr unsigned int warmup_size = 5; template auto dispatch_adjacent_difference(std::true_type /*left*/, std::true_type /*copy*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt output, Args&&... args) { return ::hipcub::DeviceAdjacentDifference::SubtractLeftCopy(temporary_storage, storage_size, input, output, std::forward(args)...); } template auto dispatch_adjacent_difference(std::false_type /*left*/, std::true_type /*copy*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt output, Args&&... args) { return ::hipcub::DeviceAdjacentDifference::SubtractRightCopy(temporary_storage, storage_size, input, output, std::forward(args)...); } template auto dispatch_adjacent_difference(std::true_type /*left*/, std::false_type /*copy*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt /*output*/, Args&&... args) { return ::hipcub::DeviceAdjacentDifference::SubtractLeft(temporary_storage, storage_size, input, std::forward(args)...); } template auto dispatch_adjacent_difference(std::false_type /*left*/, std::false_type /*copy*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt /*output*/, Args&&... 
args) { return ::hipcub::DeviceAdjacentDifference::SubtractRight(temporary_storage, storage_size, input, std::forward(args)...); } template void run_benchmark(benchmark::State& state, const std::size_t size, const hipStream_t stream) { using output_type = T; // Generate data const std::vector input = benchmark_utils::get_random_data(size, 1, 100); T* d_input; output_type* d_output = nullptr; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); HIP_CHECK( hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice)); if(copy) { HIP_CHECK(hipMalloc(&d_output, size * sizeof(output_type))); } static constexpr std::integral_constant left_tag; static constexpr std::integral_constant copy_tag; // Allocate temporary storage std::size_t temp_storage_size{}; void* d_temp_storage = nullptr; const auto launch = [&] { return dispatch_adjacent_difference(left_tag, copy_tag, d_temp_storage, temp_storage_size, d_input, d_output, size, hipcub::Sum{}, stream); }; HIP_CHECK(launch()); HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size)); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(launch()); } HIP_CHECK(hipDeviceSynchronize()); // Run for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(launch()); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); if(copy) { HIP_CHECK(hipFree(d_output)); } HIP_CHECK(hipFree(d_temp_storage)); } } // namespace using namespace std::string_literals; #define CREATE_BENCHMARK(T, left, copy) \ benchmark::RegisterBenchmark(std::string("device_adjacent_difference" \ "." \ "sub_algorithm_name:subtract_" \ + std::string(left ? "left" : "right") \ + std::string(copy ? 
"_copy" : "")) \ .c_str(), \ &run_benchmark, \ size, \ stream) // clang-format off #define CREATE_BENCHMARKS(T) \ CREATE_BENCHMARK(T, true, false), \ CREATE_BENCHMARK(T, true, true), \ CREATE_BENCHMARK(T, false, false), \ CREATE_BENCHMARK(T, false, true) // clang-format on int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); // HIP const hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "benchmark_device_adjacent_difference" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; // Add benchmarks const std::vector benchmarks = { CREATE_BENCHMARKS(int), CREATE_BENCHMARKS(std::int64_t), CREATE_BENCHMARKS(uint8_t), CREATE_BENCHMARKS(float), CREATE_BENCHMARKS(double), CREATE_BENCHMARKS(custom_float2), CREATE_BENCHMARKS(custom_double2), }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_device_batch_copy.cpp000066400000000000000000000420511502260333500236770ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark/benchmark.h" #include "cmdparser.hpp" #include "common_benchmark_header.hpp" #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_store.hpp" #include "hipcub/device/device_copy.hpp" #include "hipcub/hipcub.hpp" #include #include #include #include #include #include #include #include #include #include constexpr uint32_t warmup_size = 5; constexpr int32_t max_size = 1024 * 1024; constexpr int32_t wlev_min_size = 128; constexpr int32_t blev_min_size = 1024; // Used for generating offsets. We generate a permutation map and then derive // offsets via a sum scan over the sizes in the order of the permutation. 
This // allows us to keep the order of buffers we pass to batch_copy, but still // have source and destinations mappings not be the identity function: // // batch_copy( // [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, // d!) // [&a0', &b0', &c0', &d0'], // to (order is the same as above too!) // [3 , 2 , 1 , 2 ]) // size // // ┌───┬───┬───┬───┬───┬───┬───┬───┐ // │b0 │b1 │a0 │a1 │a2 │d0 │d1 │c0 │ buffer x contains buffers a, b, c, d // └───┴───┴───┴───┴───┴───┴───┴───┘ note that the order of buffers is shuffled! // ───┬─── ─────┬───── ───┬─── ─── // └─────────┼─────────┼───┐ // ┌───┘ ┌───┘ │ what batch_copy does // ▼ ▼ ▼ // ─── ─────────── ─────── ─────── // ┌───┬───┬───┬───┬───┬───┬───┬───┐ // │c0'│a0'│a1'│a2'│d0'│d1'│b0'│b1'│ buffer y contains buffers a', b', c', d' // └───┴───┴───┴───┴───┴───┴───┴───┘ template std::vector shuffled_exclusive_scan(const std::vector& input, RandomGenerator& rng) { const auto n = input.size(); assert(n > 0); std::vector result(n); std::vector permute(n); std::iota(permute.begin(), permute.end(), 0); std::shuffle(permute.begin(), permute.end(), rng); for(T i = 0, sum = 0; i < n; ++i) { result[permute[i]] = sum; sum += input[permute[i]]; } return result; } using offset_type = size_t; template struct BatchCopyData { size_t total_num_elements = 0; ValueType* d_input = nullptr; ValueType* d_output = nullptr; ValueType** d_buffer_srcs = nullptr; ValueType** d_buffer_dsts = nullptr; BufferSizeType* d_buffer_sizes = nullptr; BatchCopyData() = default; BatchCopyData(const BatchCopyData&) = delete; BatchCopyData(BatchCopyData&& other) : total_num_elements{std::exchange(other.total_num_elements, 0)} , d_input{std::exchange(other.d_input, nullptr)} , d_output{std::exchange(other.d_output, nullptr)} , d_buffer_srcs{std::exchange(other.d_buffer_srcs, nullptr)} , d_buffer_dsts{std::exchange(other.d_buffer_dsts, nullptr)} , d_buffer_sizes{std::exchange(other.d_buffer_sizes, nullptr)} {} BatchCopyData& operator=(BatchCopyData&& other) { total_num_elements = std::exchange(other.total_num_elements, 0); d_input = std::exchange(other.d_input, nullptr); d_output = std::exchange(other.d_output, nullptr); d_buffer_srcs = std::exchange(other.d_buffer_srcs, nullptr); d_buffer_dsts = std::exchange(other.d_buffer_dsts, nullptr); d_buffer_sizes = std::exchange(other.d_buffer_sizes, nullptr); return *this; }; BatchCopyData& operator=(const BatchCopyData&) = delete; size_t total_num_bytes() const { return total_num_elements * sizeof(ValueType); } ~BatchCopyData() { HIP_CHECK(hipFree(d_buffer_sizes)); HIP_CHECK(hipFree(d_buffer_srcs)); HIP_CHECK(hipFree(d_buffer_dsts)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_input)); } }; template BatchCopyData prepare_data(const int32_t num_tlev_buffers = 1024, const int32_t num_wlev_buffers = 1024, const int32_t num_blev_buffers = 1024) { const bool shuffle_buffers = false; BatchCopyData result; const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; constexpr int32_t wlev_min_elems = benchmark_utils::ceiling_div(wlev_min_size, sizeof(ValueType)); constexpr int32_t blev_min_elems = benchmark_utils::ceiling_div(blev_min_size, sizeof(ValueType)); constexpr int32_t max_elems = max_size / sizeof(ValueType); // Generate data std::mt19937_64 rng(std::random_device{}()); // Number of elements in each buffer. 
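// Going by the tlev/wlev/blev naming these appear to correspond to thread-, warp- and
// block-level copies: TLEV buffers draw their element count from [1, wlev_min_elems - 1],
// WLEV buffers from [wlev_min_elems, blev_min_elems - 1], and BLEV buffers from
// [blev_min_elems, max_elems], so a single run exercises every size class.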
std::vector h_buffer_num_elements(num_buffers); auto iter = h_buffer_num_elements.begin(); iter = benchmark_utils::generate_random_data_n(iter, num_tlev_buffers, 1, wlev_min_elems - 1, rng); iter = benchmark_utils::generate_random_data_n(iter, num_wlev_buffers, wlev_min_elems, blev_min_elems - 1, rng); iter = benchmark_utils::generate_random_data_n(iter, num_blev_buffers, blev_min_elems, max_elems, rng); // Shuffle the sizes so that size classes aren't clustered std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); result.total_num_elements = std::accumulate(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), size_t{0}); // Generate data. std::independent_bits_engine bits_engine{rng}; const size_t num_ints = benchmark_utils::ceiling_div(result.total_num_bytes(), sizeof(uint64_t)); auto h_input = std::make_unique(num_ints * sizeof(uint64_t)); std::for_each(reinterpret_cast(h_input.get()), reinterpret_cast(h_input.get() + num_ints * sizeof(uint64_t)), [&bits_engine](uint64_t& elem) { ::new(&elem) uint64_t{bits_engine()}; }); HIP_CHECK(hipMalloc(&result.d_input, result.total_num_bytes())); HIP_CHECK(hipMalloc(&result.d_output, result.total_num_bytes())); HIP_CHECK(hipMalloc(&result.d_buffer_srcs, num_buffers * sizeof(ValueType*))); HIP_CHECK(hipMalloc(&result.d_buffer_dsts, num_buffers * sizeof(ValueType*))); HIP_CHECK(hipMalloc(&result.d_buffer_sizes, num_buffers * sizeof(BufferSizeType))); // Generate the source and shuffled destination offsets. std::vector src_offsets; std::vector dst_offsets; if(shuffle_buffers) { src_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); dst_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); } else { src_offsets = std::vector(num_buffers); dst_offsets = std::vector(num_buffers); // Consecutive offsets (no shuffling). // src/dst offsets first element is 0, so skip that! std::partial_sum(h_buffer_num_elements.begin(), h_buffer_num_elements.end() - 1, src_offsets.begin() + 1); std::partial_sum(h_buffer_num_elements.begin(), h_buffer_num_elements.end() - 1, dst_offsets.begin() + 1); } // Generate the source and destination pointers. std::vector h_buffer_srcs(num_buffers); std::vector h_buffer_dsts(num_buffers); for(size_t i = 0; i < num_buffers; ++i) { h_buffer_srcs[i] = result.d_input + src_offsets[i]; h_buffer_dsts[i] = result.d_output + dst_offsets[i]; } // Prepare the batch copy. 
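// Stage everything hipcub::DeviceCopy::Batched consumes on the device: the generated
// input bytes, the arrays of source and destination pointers, and the per-buffer sizes.
// Note that the sizes uploaded here are element counts (h_buffer_num_elements), unlike
// the byte counts used by the batch-memcpy benchmark.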
HIP_CHECK( hipMemcpy(result.d_input, h_input.get(), result.total_num_bytes(), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(result.d_buffer_srcs, h_buffer_srcs.data(), h_buffer_srcs.size() * sizeof(ValueType*), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(result.d_buffer_dsts, h_buffer_dsts.data(), h_buffer_dsts.size() * sizeof(ValueType*), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(result.d_buffer_sizes, h_buffer_num_elements.data(), h_buffer_num_elements.size() * sizeof(BufferSizeType), hipMemcpyHostToDevice)); return result; } template void run_benchmark(benchmark::State& state, hipStream_t stream, const int32_t num_tlev_buffers = 1024, const int32_t num_wlev_buffers = 1024, const int32_t num_blev_buffers = 1024) { const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; size_t temp_storage_bytes = 0; BatchCopyData data; HIP_CHECK(hipcub::DeviceCopy::Batched(nullptr, temp_storage_bytes, data.d_buffer_srcs, data.d_buffer_dsts, data.d_buffer_sizes, num_buffers)); void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); data = prepare_data(num_tlev_buffers, num_wlev_buffers, num_blev_buffers); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(hipcub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, data.d_buffer_srcs, data.d_buffer_dsts, data.d_buffer_sizes, num_buffers, stream)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); HIP_CHECK(hipcub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, data.d_buffer_srcs, data.d_buffer_dsts, data.d_buffer_sizes, num_buffers, stream)); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } state.SetBytesProcessed(state.iterations() * data.total_num_bytes()); state.SetItemsProcessed(state.iterations() * data.total_num_elements); HIP_CHECK(hipFree(d_temp_storage)); } #define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev) \ benchmark::RegisterBenchmark( \ std::string("device_batch_copy" \ ".") \ .c_str(), \ [=](benchmark::State& state) \ { \ run_benchmark, T>(state, \ stream, \ num_tlev, \ num_wlev, \ num_blev); \ }) #define BENCHMARK_TYPE(item_size, item_alignment) \ CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) int32_t main(int32_t argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", 1024, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int32_t trials = parser.get("trials"); // HIP hipStream_t stream = hipStreamDefault; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "benchmark_device_batch_copy" << std::endl; std::cout << 
"[HIP] Device name: " << devProp.name << std::endl; // Benchmark info benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks; benchmarks = {BENCHMARK_TYPE(1, 1), BENCHMARK_TYPE(1, 2), BENCHMARK_TYPE(1, 4), BENCHMARK_TYPE(1, 8), BENCHMARK_TYPE(2, 2), BENCHMARK_TYPE(4, 4), BENCHMARK_TYPE(8, 8)}; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_device_batch_memcpy.cpp000066400000000000000000000430101502260333500242130ustar00rootroot00000000000000// MIT License // // Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark/benchmark.h" #include "cmdparser.hpp" #include "common_benchmark_header.hpp" #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_store.hpp" #include "hipcub/device/device_memcpy.hpp" #include "hipcub/hipcub.hpp" #ifdef __HIP_PLATFORM_AMD__ // Only include this on AMD as it contains specialized config information #include #endif #include #include #include #include #include #include #include constexpr uint32_t warmup_size = 5; constexpr int32_t max_size = 1024 * 1024; constexpr int32_t wlev_min_size = 128; constexpr int32_t blev_min_size = 1024; // Used for generating offsets. We generate a permutation map and then derive // offsets via a sum scan over the sizes in the order of the permutation. This // allows us to keep the order of buffers we pass to batch_memcpy, but still // have source and destinations mappings not be the identity function: // // batch_memcpy( // [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, // d!) // [&a0', &b0', &c0', &d0'], // to (order is the same as above too!) // [3 , 2 , 1 , 2 ]) // size // // ┌───┬───┬───┬───┬───┬───┬───┬───┐ // │b0 │b1 │a0 │a1 │a2 │d0 │d1 │c0 │ buffer x contains buffers a, b, c, d // └───┴───┴───┴───┴───┴───┴───┴───┘ note that the order of buffers is shuffled! 
// ───┬─── ─────┬───── ───┬─── ─── // └─────────┼─────────┼───┐ // ┌───┘ ┌───┘ │ what batch_memcpy does // ▼ ▼ ▼ // ─── ─────────── ─────── ─────── // ┌───┬───┬───┬───┬───┬───┬───┬───┐ // │c0'│a0'│a1'│a2'│d0'│d1'│b0'│b1'│ buffer y contains buffers a', b', c', d' // └───┴───┴───┴───┴───┴───┴───┴───┘ template std::vector shuffled_exclusive_scan(const std::vector& input, RandomGenerator& rng) { const auto n = input.size(); assert(n > 0); std::vector result(n); std::vector permute(n); std::iota(permute.begin(), permute.end(), 0); std::shuffle(permute.begin(), permute.end(), rng); for(T i = 0, sum = 0; i < n; ++i) { result[permute[i]] = sum; sum += input[permute[i]]; } return result; } using offset_type = size_t; template struct BatchMemcpyData { size_t total_num_elements = 0; ValueType* d_input = nullptr; ValueType* d_output = nullptr; ValueType** d_buffer_srcs = nullptr; ValueType** d_buffer_dsts = nullptr; BufferSizeType* d_buffer_sizes = nullptr; BatchMemcpyData() = default; BatchMemcpyData(const BatchMemcpyData&) = delete; BatchMemcpyData(BatchMemcpyData&& other) : total_num_elements{std::exchange(other.total_num_elements, 0)} , d_input{std::exchange(other.d_input, nullptr)} , d_output{std::exchange(other.d_output, nullptr)} , d_buffer_srcs{std::exchange(other.d_buffer_srcs, nullptr)} , d_buffer_dsts{std::exchange(other.d_buffer_dsts, nullptr)} , d_buffer_sizes{std::exchange(other.d_buffer_sizes, nullptr)} {} BatchMemcpyData& operator=(BatchMemcpyData&& other) { total_num_elements = std::exchange(other.total_num_elements, 0); d_input = std::exchange(other.d_input, nullptr); d_output = std::exchange(other.d_output, nullptr); d_buffer_srcs = std::exchange(other.d_buffer_srcs, nullptr); d_buffer_dsts = std::exchange(other.d_buffer_dsts, nullptr); d_buffer_sizes = std::exchange(other.d_buffer_sizes, nullptr); return *this; }; BatchMemcpyData& operator=(const BatchMemcpyData&) = delete; size_t total_num_bytes() const { return total_num_elements * sizeof(ValueType); } ~BatchMemcpyData() { HIP_CHECK(hipFree(d_buffer_sizes)); HIP_CHECK(hipFree(d_buffer_srcs)); HIP_CHECK(hipFree(d_buffer_dsts)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_input)); } }; template BatchMemcpyData prepare_data(const int32_t num_tlev_buffers = 1024, const int32_t num_wlev_buffers = 1024, const int32_t num_blev_buffers = 1024) { const bool shuffle_buffers = false; BatchMemcpyData result; const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; constexpr int32_t wlev_min_elems = benchmark_utils::ceiling_div(wlev_min_size, sizeof(ValueType)); constexpr int32_t blev_min_elems = benchmark_utils::ceiling_div(blev_min_size, sizeof(ValueType)); constexpr int32_t max_elems = max_size / sizeof(ValueType); // Generate data std::mt19937_64 rng(std::random_device{}()); // Number of elements in each buffer. 
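// As in the batch-copy benchmark, buffer sizes are drawn from three ranges (below
// wlev_min_elems, below blev_min_elems, and up to max_elems) so that small, medium and
// large copies are all represented in one run.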
std::vector h_buffer_num_elements(num_buffers); auto iter = h_buffer_num_elements.begin(); iter = benchmark_utils::generate_random_data_n(iter, num_tlev_buffers, 1, wlev_min_elems - 1, rng); iter = benchmark_utils::generate_random_data_n(iter, num_wlev_buffers, wlev_min_elems, blev_min_elems - 1, rng); iter = benchmark_utils::generate_random_data_n(iter, num_blev_buffers, blev_min_elems, max_elems, rng); // Shuffle the sizes so that size classes aren't clustered std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); // Get the byte size of each buffer std::vector h_buffer_num_bytes(num_buffers); for(size_t i = 0; i < num_buffers; ++i) { h_buffer_num_bytes[i] = h_buffer_num_elements[i] * sizeof(ValueType); } result.total_num_elements = std::accumulate(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), size_t{0}); // Generate data. std::independent_bits_engine bits_engine{rng}; const size_t num_ints = benchmark_utils::ceiling_div(result.total_num_bytes(), sizeof(uint64_t)); auto h_input = std::make_unique(num_ints * sizeof(uint64_t)); std::for_each(reinterpret_cast(h_input.get()), reinterpret_cast(h_input.get() + num_ints * sizeof(uint64_t)), [&bits_engine](uint64_t& elem) { ::new(&elem) uint64_t{bits_engine()}; }); HIP_CHECK(hipMalloc(&result.d_input, result.total_num_bytes())); HIP_CHECK(hipMalloc(&result.d_output, result.total_num_bytes())); HIP_CHECK(hipMalloc(&result.d_buffer_srcs, num_buffers * sizeof(ValueType*))); HIP_CHECK(hipMalloc(&result.d_buffer_dsts, num_buffers * sizeof(ValueType*))); HIP_CHECK(hipMalloc(&result.d_buffer_sizes, num_buffers * sizeof(BufferSizeType))); // Generate the source and shuffled destination offsets. std::vector src_offsets; std::vector dst_offsets; if(shuffle_buffers) { src_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); dst_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); } else { src_offsets = std::vector(num_buffers); dst_offsets = std::vector(num_buffers); // Consecutive offsets (no shuffling). // src/dst offsets first element is 0, so skip that! std::partial_sum(h_buffer_num_elements.begin(), h_buffer_num_elements.end() - 1, src_offsets.begin() + 1); std::partial_sum(h_buffer_num_elements.begin(), h_buffer_num_elements.end() - 1, dst_offsets.begin() + 1); } // Generate the source and destination pointers. std::vector h_buffer_srcs(num_buffers); std::vector h_buffer_dsts(num_buffers); for(size_t i = 0; i < num_buffers; ++i) { h_buffer_srcs[i] = result.d_input + src_offsets[i]; h_buffer_dsts[i] = result.d_output + dst_offsets[i]; } // Prepare the batch memcpy. 
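// Stage the inputs for hipcub::DeviceMemcpy::Batched: the raw input bytes, the
// source/destination pointer arrays, and the per-buffer sizes. Here the sizes are byte
// counts (h_buffer_num_bytes) rather than element counts.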
HIP_CHECK( hipMemcpy(result.d_input, h_input.get(), result.total_num_bytes(), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(result.d_buffer_srcs, h_buffer_srcs.data(), h_buffer_srcs.size() * sizeof(ValueType*), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(result.d_buffer_dsts, h_buffer_dsts.data(), h_buffer_dsts.size() * sizeof(ValueType*), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(result.d_buffer_sizes, h_buffer_num_bytes.data(), h_buffer_num_bytes.size() * sizeof(BufferSizeType), hipMemcpyHostToDevice)); return result; } template void run_benchmark(benchmark::State& state, hipStream_t stream, const int32_t num_tlev_buffers = 1024, const int32_t num_wlev_buffers = 1024, const int32_t num_blev_buffers = 1024) { const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; size_t temp_storage_bytes = 0; BatchMemcpyData data; HIP_CHECK(hipcub::DeviceMemcpy::Batched(nullptr, temp_storage_bytes, data.d_buffer_srcs, data.d_buffer_dsts, data.d_buffer_sizes, num_buffers)); void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); data = prepare_data(num_tlev_buffers, num_wlev_buffers, num_blev_buffers); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(hipcub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, data.d_buffer_srcs, data.d_buffer_dsts, data.d_buffer_sizes, num_buffers, stream)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); HIP_CHECK(hipcub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, data.d_buffer_srcs, data.d_buffer_dsts, data.d_buffer_sizes, num_buffers, stream)); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } state.SetBytesProcessed(state.iterations() * data.total_num_bytes()); state.SetItemsProcessed(state.iterations() * data.total_num_elements); HIP_CHECK(hipFree(d_temp_storage)); } #define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev) \ benchmark::RegisterBenchmark( \ std::string("device_batch_memcpy.") \ .c_str(), \ [=](benchmark::State& state) \ { \ run_benchmark, T>(state, \ stream, \ num_tlev, \ num_wlev, \ num_blev); \ }) #define BENCHMARK_TYPE(item_size, item_alignment) \ CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) int32_t main(int32_t argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", 1024, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int32_t trials = parser.get("trials"); hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "benchmark_device_adjacent_difference" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; 
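// Each BENCHMARK_TYPE(item_size, item_alignment) entry registered below covers four
// buffer mixes for that configuration: 100000 thread-level buffers, 100000 warp-level
// buffers, 1000 block-level buffers, and a 1000/1000/1000 blend of all three.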
// HIP hipStream_t stream = hipStreamDefault; // default // Benchmark info benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks; benchmarks = {BENCHMARK_TYPE(1, 1), BENCHMARK_TYPE(1, 2), BENCHMARK_TYPE(1, 4), BENCHMARK_TYPE(1, 8), BENCHMARK_TYPE(2, 2), BENCHMARK_TYPE(4, 4), BENCHMARK_TYPE(8, 8)}; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_device_for.cpp000066400000000000000000000120741502260333500223540ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. // CUB's implementation of single_pass_scan_operators has maybe uninitialized parameters, // disable the warning because all warnings are threated as errors: #include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_for.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template struct op_t { unsigned int* d_count; HIPCUB_DEVICE void operator()(T i) { // The data is non zero so atomic will never be activated. 
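// Keeping the conditional atomicAdd gives the functor an observable side effect,
// presumably so the compiler cannot discard the per-element work; with the all-4 input
// generated below the branch is never actually taken.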
if(i == 0) { atomicAdd(d_count, 1); } } }; template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { using T = Value; // Generate data std::vector values_input(size, 4); T* d_input; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, values_input.data(), size * sizeof(T), hipMemcpyHostToDevice)); unsigned int* d_count; HIP_CHECK(hipMalloc(&d_count, sizeof(T))); HIP_CHECK(hipMemset(d_count, 0, sizeof(T))); op_t device_op{d_count}; // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(hipcub::ForEach(d_input, d_input + size, device_op, stream)); } HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(hipcub::ForEach(d_input, d_input + size, device_op, stream)); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_count)); HIP_CHECK(hipFree(d_input)); } #define CREATE_BENCHMARK(Value) \ benchmark::RegisterBenchmark(("for_each"), \ &run_benchmark, \ stream, \ size) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_reduce_by_key" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; using custom_double2 = benchmark_utils::custom_type; // Add benchmarks std::vector benchmarks = { CREATE_BENCHMARK(float), CREATE_BENCHMARK(double), CREATE_BENCHMARK(custom_double2), CREATE_BENCHMARK(int8_t), CREATE_BENCHMARK(float), CREATE_BENCHMARK(double), CREATE_BENCHMARK(long long), }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_device_histogram.cpp000066400000000000000000000660631502260333500235720ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. // CUB's implementation of DeviceRunLengthEncode has unused parameters, // disable the warning because all warnings are threated as errors: #ifdef __HIP_PLATFORM_NVIDIA__ #pragma GCC diagnostic ignored "-Wunused-parameter" #endif #include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_histogram.hpp" #include "hipcub/iterator/transform_input_iterator.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template std::vector generate(size_t size, int entropy_reduction, long long lower_level, long long upper_level) { if(entropy_reduction >= 5) { return std::vector(size, (lower_level + upper_level) / 2); } const size_t max_random_size = 1024 * 1024; std::random_device rd; std::default_random_engine gen(rd()); std::vector data(size); std::generate(data.begin(), data.begin() + std::min(size, max_random_size), [&]() { // Reduce entropy by applying bitwise AND to random bits // "An Improved Supercomputer Sorting Benchmark", 1992 // Kurt Thearling & Stephen Smith auto v = gen(); for(int e = 0; e < entropy_reduction; e++) { v &= gen(); } return T(lower_level + v % (upper_level - lower_level)); }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); } return data; } int get_entropy_percents(int entropy_reduction) { switch(entropy_reduction) { case 0: return 100; case 1: return 81; case 2: return 54; case 3: return 33; case 4: return 20; default: return 0; } } const int entropy_reductions[] = {0, 2, 4, 6}; template void run_even_benchmark(benchmark::State& state, size_t bins, size_t scale, int entropy_reduction, hipStream_t stream, size_t size) { using counter_type = unsigned int; const T lower_level = 0; // casting for compilation with CUB backend because // there is no casting from size_t (aka unsigned long) to __half const T upper_level = static_cast(bins * scale); // Generate data std::vector input = generate(size, entropy_reduction, lower_level, upper_level); T* d_input; counter_type* d_histogram; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(hipcub::DeviceHistogram::HistogramEven(d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, bins + 1, lower_level, upper_level, int(size), stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(hipcub::DeviceHistogram::HistogramEven(d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, bins + 1, lower_level, upper_level, int(size), stream)); } HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) 
{ HIP_CHECK(hipcub::DeviceHistogram::HistogramEven(d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, bins + 1, lower_level, upper_level, int(size), stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_histogram)); } template void run_multi_even_benchmark(benchmark::State& state, size_t bins, size_t scale, int entropy_reduction, hipStream_t stream, size_t size) { using counter_type = unsigned int; int num_levels[ActiveChannels]; int lower_level[ActiveChannels]; int upper_level[ActiveChannels]; for(unsigned int channel = 0; channel < ActiveChannels; channel++) { lower_level[channel] = 0; upper_level[channel] = bins * scale; num_levels[channel] = bins + 1; } // Generate data std::vector input = generate(size * Channels, entropy_reduction, lower_level[0], upper_level[0]); T* d_input; counter_type* d_histogram[ActiveChannels]; HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T))); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipMalloc(&d_histogram[channel], bins * sizeof(counter_type))); } HIP_CHECK(hipMemcpy(d_input, input.data(), size * Channels * sizeof(T), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramEven( d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, num_levels, lower_level, upper_level, int(size), stream))); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramEven( d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, num_levels, lower_level, upper_level, int(size), stream))); } HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramEven( d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, num_levels, lower_level, upper_level, int(size), stream))); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size * Channels); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipFree(d_histogram[channel])); } } template void run_range_benchmark(benchmark::State& state, size_t bins, hipStream_t stream, size_t size) { using counter_type = unsigned int; // Generate data std::vector input = benchmark_utils::get_random_data(size, 0, bins); std::vector levels(bins + 1); std::iota(levels.begin(), levels.end(), static_cast(0)); T* d_input; T* d_levels; counter_type* d_histogram; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_levels, (bins + 1) * sizeof(T))); HIP_CHECK(hipMalloc(&d_histogram, 
size * sizeof(counter_type))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(d_levels, levels.data(), (bins + 1) * sizeof(T), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(hipcub::DeviceHistogram::HistogramRange(d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, bins + 1, d_levels, int(size), stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(hipcub::DeviceHistogram::HistogramRange(d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, bins + 1, d_levels, int(size), stream)); } HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(hipcub::DeviceHistogram::HistogramRange(d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, bins + 1, d_levels, int(size), stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_levels)); HIP_CHECK(hipFree(d_histogram)); } template void run_multi_range_benchmark(benchmark::State& state, size_t bins, hipStream_t stream, size_t size) { using counter_type = unsigned int; // Number of levels for a single channel const int num_levels_channel = bins + 1; int num_levels[ActiveChannels]; std::vector levels[ActiveChannels]; for(unsigned int channel = 0; channel < ActiveChannels; channel++) { levels[channel].resize(num_levels_channel); std::iota(levels[channel].begin(), levels[channel].end(), static_cast(0)); num_levels[channel] = num_levels_channel; } // Generate data std::vector input = benchmark_utils::get_random_data(size * Channels, 0, bins); T* d_input; T* d_levels[ActiveChannels]; counter_type* d_histogram[ActiveChannels]; HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T))); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipMalloc(&d_levels[channel], num_levels_channel * sizeof(T))); HIP_CHECK(hipMalloc(&d_histogram[channel], size * sizeof(counter_type))); } HIP_CHECK(hipMemcpy(d_input, input.data(), size * Channels * sizeof(T), hipMemcpyHostToDevice)); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipMemcpy(d_levels[channel], levels[channel].data(), num_levels_channel * sizeof(T), hipMemcpyHostToDevice)); } void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramRange( d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, num_levels, d_levels, int(size), stream))); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramRange( d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, num_levels, d_levels, int(size), stream))); } HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; 
i++) { HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramRange( d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, num_levels, d_levels, int(size), stream))); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size * Channels); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipFree(d_levels[channel])); HIP_CHECK(hipFree(d_histogram[channel])); } } template struct num_limits { static constexpr T max() { return std::numeric_limits::max(); }; }; template<> struct num_limits<__half> { static constexpr double max() { return 65504.0; }; }; #define CREATE_EVEN_BENCHMARK(VECTOR, T, BINS, SCALE) \ if(num_limits::max() > BINS * SCALE) \ { \ VECTOR.push_back(benchmark::RegisterBenchmark( \ std::string("device_histogram_even" \ "." \ "(entropy_percent:" \ + std::to_string(get_entropy_percents(entropy_reduction)) \ + "%,bin_count:" + std::to_string(BINS) + " bins)") \ .c_str(), \ [=](benchmark::State& state) \ { run_even_benchmark(state, BINS, SCALE, entropy_reduction, stream, size); })); \ } #define BENCHMARK_TYPE(VECTOR, T) \ CREATE_EVEN_BENCHMARK(VECTOR, T, 10, 1234); \ CREATE_EVEN_BENCHMARK(VECTOR, T, 100, 1234); \ CREATE_EVEN_BENCHMARK(VECTOR, T, 1000, 1234); \ CREATE_EVEN_BENCHMARK(VECTOR, T, 16, 10); \ CREATE_EVEN_BENCHMARK(VECTOR, T, 256, 10); \ CREATE_EVEN_BENCHMARK(VECTOR, T, 65536, 1) void add_even_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { for(int entropy_reduction : entropy_reductions) { BENCHMARK_TYPE(benchmarks, long long); BENCHMARK_TYPE(benchmarks, int); BENCHMARK_TYPE(benchmarks, unsigned short); BENCHMARK_TYPE(benchmarks, uint8_t); BENCHMARK_TYPE(benchmarks, double); BENCHMARK_TYPE(benchmarks, float); // this limitation can be removed once // https://github.com/NVIDIA/cub/issues/484 is fixed #ifdef __HIP_PLATFORM_AMD__ BENCHMARK_TYPE(benchmarks, __half); #endif }; } #define CREATE_MULTI_EVEN_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS, SCALE) \ benchmark::RegisterBenchmark( \ std::string("device_multi_histogram_even" \ "." \ "(entropy_percent:" \ + std::to_string(get_entropy_percents(entropy_reduction)) \ + "%,bin_count:" + std::to_string(BINS) + " bins)") \ .c_str(), \ [=](benchmark::State& state) \ { \ run_multi_even_benchmark(state, \ BINS, \ SCALE, \ entropy_reduction, \ stream, \ size); \ }) void add_multi_even_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { for(int entropy_reduction : entropy_reductions) { std::vector bs = { CREATE_MULTI_EVEN_BENCHMARK(4, 3, int, 10, 1234), CREATE_MULTI_EVEN_BENCHMARK(4, 3, int, 100, 1234), CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned char, 16, 10), CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned char, 256, 1), CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned short, 16, 10), CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned short, 256, 10), CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned short, 65536, 1), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); }; } #define CREATE_RANGE_BENCHMARK(T, BINS) \ benchmark::RegisterBenchmark(std::string("device_histogram_range" \ "." 
\ "(bin_count:" \ + std::to_string(BINS) + " bins)") \ .c_str(), \ [=](benchmark::State& state) \ { run_range_benchmark(state, BINS, stream, size); }) #define BENCHMARK_RANGE_TYPE(T) \ CREATE_RANGE_BENCHMARK(T, 10), CREATE_RANGE_BENCHMARK(T, 100), \ CREATE_RANGE_BENCHMARK(T, 1000), CREATE_RANGE_BENCHMARK(T, 10000), \ CREATE_RANGE_BENCHMARK(T, 100000), CREATE_RANGE_BENCHMARK(T, 1000000) void add_range_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = {BENCHMARK_RANGE_TYPE(float), BENCHMARK_RANGE_TYPE(double)}; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } #define CREATE_MULTI_RANGE_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS) \ benchmark::RegisterBenchmark( \ std::string("device_multi_histogram_range" \ ".(bin_count:" \ + std::to_string(BINS) + " bins)") \ .c_str(), \ [=](benchmark::State& state) \ { run_multi_range_benchmark(state, BINS, stream, size); }) void add_multi_range_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 10), CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 100), CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 1000), CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 10000), CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 100000), CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 1000000), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_histogram" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_even_benchmarks(benchmarks, stream, size); add_multi_even_benchmarks(benchmarks, stream, size); add_range_benchmarks(benchmarks, stream, size); add_multi_range_benchmarks(benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_device_memory.cpp000066400000000000000000000414031502260333500230740ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_scan.hpp" #include "hipcub/block/block_store.hpp" enum memory_operation_method { direct, striped, vectorize, transpose, warp_transpose }; enum kernel_operation { no_operation, block_scan, custom_operation, atomics_no_collision, atomics_inter_block_collision, atomics_inter_warp_collision, }; struct empty_storage_type {}; template struct operation; // no operation template struct operation { typedef empty_storage_type storage_type; HIPCUB_DEVICE inline void operator()(storage_type& /*storage*/, T (&)[ItemsPerThread], T* = nullptr) const {} }; // custom operation template struct operation { typedef empty_storage_type storage_type; HIPCUB_DEVICE inline void operator()(storage_type& storage, T (&input)[ItemsPerThread], T* global_mem_output = nullptr) const { (void)storage; (void)global_mem_output; #pragma unroll for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] = input[i] + 666; constexpr unsigned int repeats = 30; #pragma unroll for(unsigned int j = 0; j < repeats; j++) { input[i] = input[i] * (input[j % ItemsPerThread]); } } } }; // block scan template struct operation { typedef typename hipcub::BlockScan block_scan_type; typedef typename block_scan_type::TempStorage storage_type; HIPCUB_DEVICE inline void operator()(storage_type& storage, T (&input)[ItemsPerThread], T* global_mem_output = nullptr) { (void)global_mem_output; // sync before re-using shared memory from load __syncthreads(); block_scan_type(storage).InclusiveScan(input, input, hipcub::Sum()); } }; // atomics_no_collision template struct operation { typedef empty_storage_type storage_type; HIPCUB_DEVICE inline void operator()(storage_type& storage, T (&input)[ItemsPerThread], T* global_mem_output = nullptr) { (void)storage; (void)input; const unsigned int index = threadIdx.x * ItemsPerThread + blockIdx.x * blockDim.x * ItemsPerThread; #pragma unroll for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); } } }; // atomics_inter_block_collision template struct operation { typedef empty_storage_type storage_type; HIPCUB_DEVICE inline void operator()(storage_type& storage, T (&input)[ItemsPerThread], T* global_mem_output = nullptr) { (void)storage; (void)input; const unsigned int index = (threadIdx.x % warpSize) * ItemsPerThread + blockIdx.x * blockDim.x * ItemsPerThread; #pragma unroll for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); } } }; // atomics_inter_block_collision template struct operation { typedef empty_storage_type storage_type; HIPCUB_DEVICE inline void operator()(storage_type& storage, T (&input)[ItemsPerThread], T* global_mem_output = nullptr) { (void)storage; (void)input; const unsigned int index = threadIdx.x * ItemsPerThread; #pragma unroll for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); } } }; template struct memory_operation {}; template<> struct memory_operation { static 
constexpr hipcub::BlockLoadAlgorithm load_type = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_DIRECT; static constexpr hipcub::BlockStoreAlgorithm store_type = hipcub::BlockStoreAlgorithm::BLOCK_STORE_DIRECT; }; template<> struct memory_operation { static constexpr hipcub::BlockLoadAlgorithm load_type = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_STRIPED; static constexpr hipcub::BlockStoreAlgorithm store_type = hipcub::BlockStoreAlgorithm::BLOCK_STORE_STRIPED; }; template<> struct memory_operation { static constexpr hipcub::BlockLoadAlgorithm load_type = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_VECTORIZE; static constexpr hipcub::BlockStoreAlgorithm store_type = hipcub::BlockStoreAlgorithm::BLOCK_STORE_VECTORIZE; }; template<> struct memory_operation { static constexpr hipcub::BlockLoadAlgorithm load_type = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_TRANSPOSE; static constexpr hipcub::BlockStoreAlgorithm store_type = hipcub::BlockStoreAlgorithm::BLOCK_STORE_TRANSPOSE; }; template<> struct memory_operation { static constexpr hipcub::BlockLoadAlgorithm load_type = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE; static constexpr hipcub::BlockStoreAlgorithm store_type = hipcub::BlockStoreAlgorithm::BLOCK_STORE_WARP_TRANSPOSE; }; template __global__ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) { typedef memory_operation mem_op; typedef hipcub::BlockLoad load_type; typedef hipcub::BlockStore store_type; __shared__ union { typename load_type::TempStorage load; typename store_type::TempStorage store; typename CustomOp::storage_type operand; } storage; constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int offset = blockIdx.x * items_per_block; T items[ItemsPerThread]; load_type(storage.load).Load(input + offset, items); op(storage.operand, items, output); // sync before re-using shared memory from load or from operand __syncthreads(); store_type(storage.store).Store(output + offset, items); } template void run_benchmark(benchmark::State& state, size_t size, const hipStream_t stream) { const size_t grid_size = size / (BlockSize * ItemsPerThread); std::vector input = benchmark_utils::get_random_data(size, benchmark_utils::generate_limits::min(), benchmark_utils::generate_limits::max()); T* d_input; T* d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); operation selected_operation; // Warm-up for(size_t i = 0; i < 10; i++) { hipLaunchKernelGGL(HIP_KERNEL_NAME(operation_kernel), dim3(grid_size), dim3(BlockSize), 0, stream, d_input, d_output, selected_operation); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); const unsigned int batch_size = 10; for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { hipLaunchKernelGGL( HIP_KERNEL_NAME(operation_kernel), dim3(grid_size), dim3(BlockSize), 0, stream, d_input, d_output, selected_operation); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events 
HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } template void run_benchmark_memcpy(benchmark::State& state, size_t size, const hipStream_t stream) { // Allocate device buffers // Note: since this benchmark only tests memcpy performance between device // buffers, we don't really need to copy data into these from the host - // whatever happens to be in memory will suffice. T* d_input; T* d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK(hipMemcpy(d_output, d_input, size * sizeof(T), hipMemcpyDeviceToDevice)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); const unsigned int batch_size = 10; for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(hipMemcpy(d_output, d_input, size * sizeof(T), hipMemcpyDeviceToDevice)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK_IPT(METHOD, OPERATION, T, SIZE, BS, IPT) \ benchmarks.push_back(benchmark::RegisterBenchmark( \ std::string("device_memory.") \ .c_str(), \ [=](benchmark::State& state) \ { run_benchmark(state, SIZE, stream); })); #define CREATE_BENCHMARK_MEMCPY(T, SIZE) \ benchmarks.push_back(benchmark::RegisterBenchmark( \ std::string("device_memory_memcpy.").c_str(), \ [=](benchmark::State& state) { run_benchmark_memcpy(state, SIZE, stream); })); // clang-format off #define CREATE_BENCHMARK_BLOCK_SIZE(MEM_OP, OP, TYPE, SIZE, BLOCK_SIZE) \ CREATE_BENCHMARK_IPT(MEM_OP, OP, TYPE, SIZE, BLOCK_SIZE, 1) \ CREATE_BENCHMARK_IPT(MEM_OP, OP, TYPE, SIZE, BLOCK_SIZE, 2) \ CREATE_BENCHMARK_IPT(MEM_OP, OP, TYPE, SIZE, BLOCK_SIZE, 4) \ CREATE_BENCHMARK_IPT(MEM_OP, OP, TYPE, SIZE, BLOCK_SIZE, 8) #define CREATE_BENCHMARK_MEM_OP(MEM_OP, OP, TYPE, SIZE) \ CREATE_BENCHMARK_BLOCK_SIZE(MEM_OP, OP, TYPE, SIZE, 256) #define CREATE_BENCHMARK(OP, TYPE, SIZE) \ CREATE_BENCHMARK_MEM_OP(direct, OP, TYPE, SIZE) \ CREATE_BENCHMARK_MEM_OP(striped, OP, TYPE, SIZE) \ CREATE_BENCHMARK_MEM_OP(vectorize, OP, TYPE, SIZE) \ CREATE_BENCHMARK_MEM_OP(transpose, OP, TYPE, SIZE) \ CREATE_BENCHMARK_MEM_OP(warp_transpose, OP, TYPE, SIZE) // clang-format on template constexpr unsigned int megabytes(unsigned int size) { return (size * (1024 * 1024 / sizeof(T))); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const int trials = parser.get("trials"); std::cout << "benchmark_device_memory" << std::endl; // HIP hipStream_t stream = 0; // default 
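// The registrations below pair each load/store strategy (direct, striped, vectorize,
// transpose, warp_transpose) with each kernel operation (no_operation, block_scan,
// custom_operation and the three atomic variants) at a block size of 256 with 1, 2, 4 or
// 8 items per thread, all on 128 MiB of int data, plus a plain device-to-device
// hipMemcpy baseline for reference.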
hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; // Simple memory copy from device to device, not running a kernel CREATE_BENCHMARK_MEMCPY(int, megabytes(128)) // clang-format off CREATE_BENCHMARK(no_operation, int, megabytes(128)) CREATE_BENCHMARK(block_scan, int, megabytes(128)) CREATE_BENCHMARK(custom_operation, int, megabytes(128)) CREATE_BENCHMARK(atomics_no_collision, int, megabytes(128)) CREATE_BENCHMARK(atomics_inter_block_collision, int, megabytes(128)) CREATE_BENCHMARK(atomics_inter_warp_collision, int, megabytes(128)) // clang-format on // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_device_merge_sort.cpp000066400000000000000000000317121502260333500237340ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
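// This benchmark measures hipcub::DeviceMergeSort::SortKeysCopy and SortPairsCopy with
// the device-side less-than functor defined below. Like the other device algorithms, the
// API is called twice: a first call with a null temporary-storage pointer only reports
// the required temporary storage size, and the second call performs the sort. A minimal
// keys-only sketch of that pattern (error checking omitted; d_in, d_out and n are
// assumed to describe n keys already resident on the device):
//
//   size_t temp_bytes = 0;
//   hipcub::DeviceMergeSort::SortKeysCopy(nullptr, temp_bytes, d_in, d_out, n,
//                                         CompareFunction<int>{});
//   void* d_temp = nullptr;
//   hipMalloc(&d_temp, temp_bytes);
//   hipcub::DeviceMergeSort::SortKeysCopy(d_temp, temp_bytes, d_in, d_out, n,
//                                         CompareFunction<int>{});
//   hipFree(d_temp);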
#include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_merge_sort.hpp" #include "hipcub/hipcub.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 32 << 20; #endif const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template struct CompareFunction { HIPCUB_DEVICE inline constexpr bool operator()(const key_type& a, const key_type& b) { return a < b; } }; template void run_sort_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { using key_type = Key; CompareFunction compare_function; std::vector keys_input = benchmark_utils::get_random_data( size, benchmark_utils::generate_limits::min(), benchmark_utils::generate_limits::max()); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(hipcub::DeviceMergeSort::SortKeysCopy(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, compare_function, stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(hipcub::DeviceMergeSort::SortKeysCopy(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, compare_function, stream)); } HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(hipcub::DeviceMergeSort::SortKeysCopy(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, compare_function, stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } template void run_sort_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { using key_type = Key; using value_type = Value; CompareFunction compare_function; std::vector keys_input = benchmark_utils::get_random_data( size, benchmark_utils::generate_limits::min(), benchmark_utils::generate_limits::max()); std::vector values_input(size); for(size_t i = 0; i < size; i++) { values_input[i] = value_type(i); } key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; value_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(hipcub::DeviceMergeSort::SortPairsCopy(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_values_input, d_keys_output, d_values_output, size, 
compare_function, stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(hipcub::DeviceMergeSort::SortPairsCopy(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_values_input, d_keys_output, d_values_output, size, compare_function, stream)); } HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(hipcub::DeviceMergeSort::SortPairsCopy(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_values_input, d_keys_output, d_values_output, size, compare_function, stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } #define CREATE_SORT_KEYS_BENCHMARK(T) \ benchmarks.push_back(benchmark::RegisterBenchmark( \ std::string("device_merge_sort_sort_keys" \ ".") \ .c_str(), \ [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size); })); #define CREATE_SORT_PAIRS_BENCHMARK(T, V) \ benchmarks.push_back(benchmark::RegisterBenchmark( \ std::string("device_merge_sort_sort_pairs<" \ ",key_data_type:" #T ",value_data_type:" #V ">.") \ .c_str(), \ [=](benchmark::State& state) { run_sort_pairs_benchmark(state, stream, size); })); void add_sort_keys_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { CREATE_SORT_KEYS_BENCHMARK(int) CREATE_SORT_KEYS_BENCHMARK(long long) CREATE_SORT_KEYS_BENCHMARK(int8_t) CREATE_SORT_KEYS_BENCHMARK(uint8_t) CREATE_SORT_KEYS_BENCHMARK(short) } void add_sort_pairs_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; using custom_char_double = benchmark_utils::custom_type; using custom_double_char = benchmark_utils::custom_type; CREATE_SORT_PAIRS_BENCHMARK(int, float) CREATE_SORT_PAIRS_BENCHMARK(int, double) CREATE_SORT_PAIRS_BENCHMARK(int, custom_float2) CREATE_SORT_PAIRS_BENCHMARK(int, custom_double2) CREATE_SORT_PAIRS_BENCHMARK(int, custom_char_double) CREATE_SORT_PAIRS_BENCHMARK(int, custom_double_char) CREATE_SORT_PAIRS_BENCHMARK(long long, float) CREATE_SORT_PAIRS_BENCHMARK(long long, double) CREATE_SORT_PAIRS_BENCHMARK(long long, custom_float2) CREATE_SORT_PAIRS_BENCHMARK(long long, custom_char_double) CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double_char) CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double2) CREATE_SORT_PAIRS_BENCHMARK(int8_t, int8_t) CREATE_SORT_PAIRS_BENCHMARK(uint8_t, uint8_t) } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; 
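// The device query below is informational only: the benchmark prints the GPU name so that saved
// logs record which device produced the numbers.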
HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "benchmark_device_merge_sort" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_sort_keys_benchmarks(benchmarks, stream, size); add_sort_pairs_benchmarks(benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_device_partition.cpp000066400000000000000000000463531502260333500236060ustar00rootroot00000000000000// MIT License // // Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
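// This file benchmarks hipcub::DevicePartition in its Flagged, If (predicate) and three-way forms.
// As a minimal illustration of the predicate form, with names chosen here for the sketch only
// (d_in, d_out and d_num_selected are device buffers, n is the item count) and this file's
// LessOp functor:
//
//   const auto pred = LessOp<int>{50};          // keep values < 50
//   void*  d_temp     = nullptr;
//   size_t temp_bytes = 0;
//   HIP_CHECK(hipcub::DevicePartition::If(d_temp, temp_bytes, d_in, d_out,
//                                         d_num_selected, n, pred, stream));   // size query
//   HIP_CHECK(hipMalloc(&d_temp, temp_bytes));
//   HIP_CHECK(hipcub::DevicePartition::If(d_temp, temp_bytes, d_in, d_out,
//                                         d_num_selected, n, pred, stream));   // partition
//
// Selected items are written to the front of d_out, the remaining items are placed after them,
// and the number of selected items ends up in *d_num_selected.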
#include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_partition.hpp" #include #include #ifndef DEFAULT_N constexpr size_t DEFAULT_N = 1024 * 1024 * 32; #endif constexpr unsigned int batch_size = 10; constexpr unsigned int warmup_size = 5; namespace { template struct LessOp { HIPCUB_HOST_DEVICE LessOp(const T& pivot) : pivot_{pivot} {} HIPCUB_HOST_DEVICE bool operator()(const T& val) const { return val < pivot_; } private: T pivot_; }; } // namespace template void run_flagged(benchmark::State& state, const hipStream_t stream, const T threshold, const size_t size) { const auto select_op = LessOp{threshold}; const auto input = benchmark_utils::get_random_data(size, static_cast(0), static_cast(100)); std::vector flags(size); for(unsigned int i = 0; i < size; i++) { flags[i] = static_cast(select_op(input[i])); } T* d_input = nullptr; F* d_flags = nullptr; T* d_output = nullptr; unsigned int* d_num_selected_output = nullptr; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_flags, input.size() * sizeof(F))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_num_selected_output, sizeof(unsigned int))); // Allocate temporary storage void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; HIP_CHECK(hipcub::DevicePartition::Flagged(nullptr, temp_storage_bytes, d_input, d_flags, d_output, d_num_selected_output, static_cast(input.size()), stream)); HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); // Warm-up HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(F), hipMemcpyHostToDevice)); for(unsigned int i = 0; i < warmup_size; ++i) { HIP_CHECK(hipcub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_input, d_flags, d_output, d_num_selected_output, static_cast(input.size()), stream)); } HIP_CHECK(hipDeviceSynchronize()); // Run benchmark for(auto _ : state) { namespace chrono = std::chrono; using clock = chrono::high_resolution_clock; const auto start = clock::now(); for(unsigned int i = 0; i < batch_size; ++i) { HIP_CHECK(hipcub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_input, d_flags, d_output, d_num_selected_output, static_cast(input.size()), stream)); } HIP_CHECK(hipDeviceSynchronize()); const auto end = clock::now(); using seconds_d = chrono::duration; const auto elapsed_seconds = chrono::duration_cast(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetItemsProcessed(state.iterations() * batch_size * input.size()); state.SetBytesProcessed( static_cast(state.iterations() * batch_size * input.size() * sizeof(input[0]))); HIP_CHECK(hipFree(d_temp_storage)); HIP_CHECK(hipFree(d_num_selected_output)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_flags)); HIP_CHECK(hipFree(d_input)); } template void run_predicate(benchmark::State& state, const hipStream_t stream, const T threshold, const size_t size) { const auto input = benchmark_utils::get_random_data(size, static_cast(0), static_cast(100)); T* d_input = nullptr; T* d_output = nullptr; unsigned int* d_num_selected_output = nullptr; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_num_selected_output, sizeof(unsigned int))); const auto select_op = LessOp{threshold}; // Allocate temporary storage void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; 
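// First call with a null temporary-storage pointer: DevicePartition::If only computes the
// required scratch size and writes it to temp_storage_bytes; no partitioning is done yet.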
HIP_CHECK(hipcub::DevicePartition::If(nullptr, temp_storage_bytes, d_input, d_output, d_num_selected_output, static_cast(input.size()), select_op, stream)); HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); // Warm-up HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); for(unsigned int i = 0; i < warmup_size; ++i) { HIP_CHECK(hipcub::DevicePartition::If(d_temp_storage, temp_storage_bytes, d_input, d_output, d_num_selected_output, static_cast(input.size()), select_op, stream)); } HIP_CHECK(hipDeviceSynchronize()); // Run benchmark for(auto _ : state) { namespace chrono = std::chrono; using clock = chrono::high_resolution_clock; const auto start = clock::now(); for(unsigned int i = 0; i < batch_size; ++i) { HIP_CHECK(hipcub::DevicePartition::If(d_temp_storage, temp_storage_bytes, d_input, d_output, d_num_selected_output, static_cast(input.size()), select_op, stream)); } HIP_CHECK(hipDeviceSynchronize()); const auto end = clock::now(); using seconds_d = chrono::duration; const auto elapsed_seconds = chrono::duration_cast(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetItemsProcessed(state.iterations() * batch_size * input.size()); state.SetBytesProcessed( static_cast(state.iterations() * batch_size * input.size() * sizeof(input[0]))); HIP_CHECK(hipFree(d_temp_storage)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_num_selected_output)); } template void run_threeway(benchmark::State& state, const hipStream_t stream, const T small_threshold, const T large_threshold, const size_t size) { const auto input = benchmark_utils::get_random_data(size, static_cast(0), static_cast(100)); T* d_input = nullptr; T* d_first_output = nullptr; T* d_second_output = nullptr; T* d_unselected_output = nullptr; unsigned int* d_num_selected_output = nullptr; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_first_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_second_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_unselected_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_num_selected_output, 2 * sizeof(unsigned int))); const auto select_first_part_op = LessOp{small_threshold}; const auto select_second_part_op = LessOp{large_threshold}; // Allocate temporary storage void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; HIP_CHECK(hipcub::DevicePartition::If(nullptr, temp_storage_bytes, d_input, d_first_output, d_second_output, d_unselected_output, d_num_selected_output, static_cast(input.size()), select_first_part_op, select_second_part_op, stream)); HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); // Warm-up HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); for(unsigned int i = 0; i < warmup_size; ++i) { HIP_CHECK(hipcub::DevicePartition::If(d_temp_storage, temp_storage_bytes, d_input, d_first_output, d_second_output, d_unselected_output, d_num_selected_output, static_cast(input.size()), select_first_part_op, select_second_part_op, stream)); } HIP_CHECK(hipDeviceSynchronize()); // Run benchmark for(auto _ : state) { namespace chrono = std::chrono; using clock = chrono::high_resolution_clock; const auto start = clock::now(); for(unsigned int i = 0; i < batch_size; ++i) { HIP_CHECK(hipcub::DevicePartition::If(d_temp_storage, temp_storage_bytes, d_input, d_first_output, d_second_output, d_unselected_output, d_num_selected_output, static_cast(input.size()), select_first_part_op, 
select_second_part_op, stream)); } HIP_CHECK(hipDeviceSynchronize()); const auto end = clock::now(); using seconds_d = chrono::duration; const auto elapsed_seconds = chrono::duration_cast(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetItemsProcessed(state.iterations() * batch_size * input.size()); state.SetBytesProcessed( static_cast(state.iterations() * batch_size * input.size() * sizeof(input[0]))); HIP_CHECK(hipFree(d_temp_storage)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_first_output)); HIP_CHECK(hipFree(d_second_output)); HIP_CHECK(hipFree(d_unselected_output)); HIP_CHECK(hipFree(d_num_selected_output)); } #define CREATE_BENCHMARK_FLAGGED(T, T_FLAG, SPLIT_T) \ benchmark::RegisterBenchmark(std::string("device_parition_flagged.(split_threshold:" #SPLIT_T \ "%)") \ .c_str(), \ &run_flagged, \ stream, \ static_cast(SPLIT_T), \ size) #define CREATE_BENCHMARK_PREDICATE(T, SPLIT_T) \ benchmark::RegisterBenchmark( \ std::string("device_parition_predicate.(split_threshold:" #SPLIT_T "%)") \ .c_str(), \ &run_predicate, \ stream, \ static_cast(SPLIT_T), \ size) #define CREATE_BENCHMARK_THREEWAY(T, SMALL_T, LARGE_T) \ benchmark::RegisterBenchmark(std::string("device_parition_three_way" \ ".(small_threshold:" #SMALL_T \ "%,large_threshold:" #LARGE_T "%)") \ .c_str(), \ &run_threeway, \ stream, \ static_cast(SMALL_T), \ static_cast(LARGE_T), \ size) #define BENCHMARK_FLAGGED_TYPE(type, flag_type) \ CREATE_BENCHMARK_FLAGGED(type, flag_type, 33), CREATE_BENCHMARK_FLAGGED(type, flag_type, 50), \ CREATE_BENCHMARK_FLAGGED(type, flag_type, 60), \ CREATE_BENCHMARK_FLAGGED(type, flag_type, 90) #define BENCHMARK_PREDICATE_TYPE(type) \ CREATE_BENCHMARK_PREDICATE(type, 33), CREATE_BENCHMARK_PREDICATE(type, 50), \ CREATE_BENCHMARK_PREDICATE(type, 60), CREATE_BENCHMARK_PREDICATE(type, 90) #define BENCHMARK_THREEWAY_TYPE(type) \ CREATE_BENCHMARK_THREEWAY(type, 33, 66), CREATE_BENCHMARK_THREEWAY(type, 10, 66), \ CREATE_BENCHMARK_THREEWAY(type, 50, 60), CREATE_BENCHMARK_THREEWAY(type, 50, 90) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_partition" << std::endl; // HIP const hipStream_t stream = 0; // default { hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; } using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; // Add benchmarks std::vector benchmarks = { BENCHMARK_FLAGGED_TYPE(int8_t, unsigned char), BENCHMARK_FLAGGED_TYPE(int, unsigned char), BENCHMARK_FLAGGED_TYPE(float, unsigned char), BENCHMARK_FLAGGED_TYPE(long long, uint8_t), BENCHMARK_FLAGGED_TYPE(double, int8_t), BENCHMARK_FLAGGED_TYPE(custom_float2, int8_t), BENCHMARK_FLAGGED_TYPE(custom_double2, unsigned char), BENCHMARK_PREDICATE_TYPE(int8_t), BENCHMARK_PREDICATE_TYPE(int), BENCHMARK_PREDICATE_TYPE(float), BENCHMARK_PREDICATE_TYPE(long long), BENCHMARK_PREDICATE_TYPE(double), BENCHMARK_PREDICATE_TYPE(custom_float2), BENCHMARK_PREDICATE_TYPE(custom_double2), BENCHMARK_THREEWAY_TYPE(int8_t), BENCHMARK_THREEWAY_TYPE(int), BENCHMARK_THREEWAY_TYPE(float), 
BENCHMARK_THREEWAY_TYPE(long long), BENCHMARK_THREEWAY_TYPE(double), BENCHMARK_THREEWAY_TYPE(custom_float2), BENCHMARK_THREEWAY_TYPE(custom_double2), }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_device_radix_sort.cpp000066400000000000000000000537501502260333500237520ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" #include #include // HIP API #include "hipcub/device/device_radix_sort.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template std::vector generate_keys(size_t size) { using key_type = Key; return benchmark_utils::get_random_data( size, benchmark_utils::generate_limits::min(), benchmark_utils::generate_limits::max()); } template auto invoke_sort_keys(void* d_temp_storage, size_t& temp_storage_bytes, Key* d_keys_input, Key* d_keys_output, size_t size, hipStream_t stream) -> std::enable_if_t::value, hipError_t> { return hipcub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, size, 0, sizeof(Key) * 8, stream); } template auto invoke_sort_keys(void* d_temp_storage, size_t& temp_storage_bytes, Key* d_keys_input, Key* d_keys_output, size_t size, hipStream_t stream) -> std::enable_if_t::value, hipError_t> { return hipcub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, size, 0, sizeof(Key) * 8, stream); } template auto invoke_sort_keys(void* d_temp_storage, size_t& temp_storage_bytes, Key* d_keys_input, Key* d_keys_output, size_t size, hipStream_t stream) -> std::enable_if_t::value, hipError_t> { return hipcub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, size, benchmark_utils::custom_type_decomposer{}, stream); } template auto invoke_sort_keys(void* d_temp_storage, size_t& temp_storage_bytes, Key* d_keys_input, Key* d_keys_output, size_t size, hipStream_t stream) -> std::enable_if_t::value, hipError_t> { return hipcub::DeviceRadixSort::SortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, 
size, benchmark_utils::custom_type_decomposer{}, stream); } template void run_sort_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t size, std::shared_ptr> keys_input) { using key_type = Key; key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input->data(), size * sizeof(key_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(invoke_sort_keys(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(invoke_sort_keys(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, stream)); } HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(invoke_sort_keys(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } template auto invoke_sort_pairs(void* d_temp_storage, size_t& temp_storage_bytes, Key* d_keys_input, Key* d_keys_output, Value* d_values_input, Value* d_values_output, size_t size, hipStream_t stream) -> std::enable_if_t::value, hipError_t> { return hipcub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, 0, sizeof(Key) * 8, stream); } template auto invoke_sort_pairs(void* d_temp_storage, size_t& temp_storage_bytes, Key* d_keys_input, Key* d_keys_output, Value* d_values_input, Value* d_values_output, size_t size, hipStream_t stream) -> std::enable_if_t::value, hipError_t> { return hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, 0, sizeof(Key) * 8, stream); } template auto invoke_sort_pairs(void* d_temp_storage, size_t& temp_storage_bytes, Key* d_keys_input, Key* d_keys_output, Value* d_values_input, Value* d_values_output, size_t size, hipStream_t stream) -> std::enable_if_t::value, hipError_t> { return hipcub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, benchmark_utils::custom_type_decomposer{}, stream); } template auto invoke_sort_pairs(void* d_temp_storage, size_t& temp_storage_bytes, Key* d_keys_input, Key* d_keys_output, Value* d_values_input, Value* d_values_output, size_t size, hipStream_t stream) -> std::enable_if_t::value, hipError_t> { return hipcub::DeviceRadixSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, benchmark_utils::custom_type_decomposer{}, stream); } template void run_sort_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_t size, 
std::shared_ptr> keys_input) { using key_type = Key; using value_type = Value; std::vector values_input(size); for(size_t i = 0; i < size; i++) { values_input[i] = value_type(i); } key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input->data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; value_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(invoke_sort_pairs(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(invoke_sort_pairs(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, stream)); } HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(invoke_sort_pairs(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } #define CREATE_SORT_KEYS_BENCHMARK(Key) \ { \ auto keys_input = std::make_shared>(generate_keys(size)); \ benchmarks.push_back(benchmark::RegisterBenchmark( \ std::string("device_radix_sort_keys_ascending" \ ".") \ .c_str(), \ [=](benchmark::State& state) \ { run_sort_keys_benchmark(state, stream, size, keys_input); })); \ benchmarks.push_back(benchmark::RegisterBenchmark( \ std::string("device_radix_sort_keys_descending" \ ".") \ .c_str(), \ [=](benchmark::State& state) \ { run_sort_keys_benchmark(state, stream, size, keys_input); })); \ } #define CREATE_SORT_PAIRS_BENCHMARK(Key, Value) \ { \ auto keys_input = std::make_shared>(generate_keys(size)); \ benchmarks.push_back(benchmark::RegisterBenchmark( \ std::string("device_radix_sort_pairs_ascending" \ ".") \ .c_str(), \ [=](benchmark::State& state) \ { run_sort_pairs_benchmark(state, stream, size, keys_input); })); \ benchmarks.push_back(benchmark::RegisterBenchmark( \ std::string("device_radix_sort_pairs_descending" \ ".") \ .c_str(), \ [=](benchmark::State& state) \ { run_sort_pairs_benchmark(state, stream, size, keys_input); })); \ } void add_sort_keys_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_int_t = benchmark_utils::custom_type; CREATE_SORT_KEYS_BENCHMARK(int) CREATE_SORT_KEYS_BENCHMARK(long long) CREATE_SORT_KEYS_BENCHMARK(int8_t) CREATE_SORT_KEYS_BENCHMARK(uint8_t) 
CREATE_SORT_KEYS_BENCHMARK(short) CREATE_SORT_KEYS_BENCHMARK(custom_int_t) } void add_sort_pairs_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; using custom_char_double = benchmark_utils::custom_type; using custom_double_char = benchmark_utils::custom_type; using custom_int_t = benchmark_utils::custom_type; CREATE_SORT_PAIRS_BENCHMARK(int, float) CREATE_SORT_PAIRS_BENCHMARK(int, double) CREATE_SORT_PAIRS_BENCHMARK(int, custom_float2) CREATE_SORT_PAIRS_BENCHMARK(int, custom_double2) CREATE_SORT_PAIRS_BENCHMARK(int, custom_char_double) CREATE_SORT_PAIRS_BENCHMARK(int, custom_double_char) CREATE_SORT_PAIRS_BENCHMARK(long long, float) CREATE_SORT_PAIRS_BENCHMARK(long long, double) CREATE_SORT_PAIRS_BENCHMARK(long long, custom_float2) CREATE_SORT_PAIRS_BENCHMARK(long long, custom_char_double) CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double_char) CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double2) CREATE_SORT_PAIRS_BENCHMARK(int8_t, int8_t) CREATE_SORT_PAIRS_BENCHMARK(uint8_t, uint8_t) CREATE_SORT_PAIRS_BENCHMARK(custom_int_t, float) } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_radix_sort" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_sort_keys_benchmarks(benchmarks, stream, size); add_sort_pairs_benchmarks(benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_device_reduce.cpp000066400000000000000000000160331502260333500230340ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_reduce.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template void run_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, ReduceKernel reduce) { std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); T* d_input; OutputT* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, sizeof(OutputT))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes = 0; void* d_temp_storage = nullptr; // Get size of d_temp_storage HIP_CHECK(reduce(d_temp_storage, temp_storage_size_bytes, d_input, d_output, size, stream)); HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(reduce(d_temp_storage, temp_storage_size_bytes, d_input, d_output, size, stream)); } HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( reduce(d_temp_storage, temp_storage_size_bytes, d_input, d_output, size, stream)); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_temp_storage)); } template struct Benchmark; template struct Benchmark { static void run(benchmark::State& state, size_t size, const hipStream_t stream) { hipError_t (*ptr_to_sum)(void*, size_t&, T*, T*, int, hipStream_t) = &hipcub::DeviceReduce::Sum; run_benchmark(state, size, stream, ptr_to_sum); } }; template struct Benchmark { static void run(benchmark::State& state, size_t size, const hipStream_t stream) { hipError_t (*ptr_to_min)(void*, size_t&, T*, T*, int, hipStream_t) = &hipcub::DeviceReduce::Min; run_benchmark(state, size, stream, ptr_to_min); } }; template struct Benchmark { using Difference = int; using Iterator = typename hipcub::ArgIndexInputIterator; using KeyValue = typename Iterator::value_type; static void run(benchmark::State& state, size_t size, const hipStream_t stream) { hipError_t (*ptr_to_argmin)(void*, size_t&, T*, KeyValue*, int, hipStream_t) = &hipcub::DeviceReduce::ArgMin; run_benchmark(state, size, stream, ptr_to_argmin); } }; #define CREATE_BENCHMARK(T, REDUCE_OP) \ benchmark::RegisterBenchmark(std::string("device_reduce" \ ".") \ .c_str(), \ &Benchmark::run, \ size, \ stream) #define CREATE_BENCHMARKS(REDUCE_OP) \ CREATE_BENCHMARK(int, REDUCE_OP), CREATE_BENCHMARK(long long, REDUCE_OP), \ CREATE_BENCHMARK(float, REDUCE_OP), CREATE_BENCHMARK(double, REDUCE_OP), \ CREATE_BENCHMARK(int8_t, REDUCE_OP) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", 
DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_reduce" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; using custom_double2 = benchmark_utils::custom_type; // Add benchmarks std::vector benchmarks = { CREATE_BENCHMARKS(hipcub::Sum), CREATE_BENCHMARK(custom_double2, hipcub::Sum), CREATE_BENCHMARKS(hipcub::Min), #ifdef HIPCUB_ROCPRIM_API CREATE_BENCHMARK(custom_double2, hipcub::Min), #endif CREATE_BENCHMARKS(hipcub::ArgMin), #ifdef HIPCUB_ROCPRIM_API CREATE_BENCHMARK(custom_double2, hipcub::ArgMin), #endif }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_device_reduce_by_key.cpp000066400000000000000000000244451502260333500244040ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
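// DeviceReduce::ReduceByKey reduces each run of equal consecutive keys to a single aggregate.
// As a small illustration of the semantics exercised by this benchmark (values made up for the
// example): keys {1,1,2,2,2,3} and values {1,2,3,4,5,6} reduced with hipcub::Sum produce
// unique keys {1,2,3}, aggregates {3,12,6} and a run count of 3.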
// CUB's implementation of single_pass_scan_operators has maybe uninitialized // parameters, disable the warning because all warnings are threated as errors: #ifdef __HIP_PLATFORM_NVIDIA__ #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif #include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_reduce.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size, BinaryFunction reduce_op) { using key_type = Key; using value_type = Value; // Generate data std::vector keys_input(size); unsigned int unique_count = 0; std::vector key_counts = benchmark_utils::get_random_data(100000, 1, max_length); size_t offset = 0; while(offset < size) { const size_t key_count = key_counts[unique_count % key_counts.size()]; const size_t end = std::min(size, offset + key_count); for(size_t i = offset; i < end; i++) { keys_input[i] = unique_count; } unique_count++; offset += key_count; } std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); key_type* d_keys_input; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK( hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); key_type* d_unique_output; value_type* d_aggregates_output; unsigned int* d_unique_count_output; HIP_CHECK(hipMalloc(&d_unique_output, unique_count * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_aggregates_output, unique_count * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_unique_count_output, sizeof(unsigned int))); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(hipcub::DeviceReduce::ReduceByKey(nullptr, temporary_storage_bytes, d_keys_input, d_unique_output, d_values_input, d_aggregates_output, d_unique_count_output, reduce_op, size, stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(hipcub::DeviceReduce::ReduceByKey(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_unique_output, d_values_input, d_aggregates_output, d_unique_count_output, reduce_op, size, stream)); } HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(hipcub::DeviceReduce::ReduceByKey(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_unique_output, d_values_input, d_aggregates_output, d_unique_count_output, reduce_op, size, stream)); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_unique_output)); HIP_CHECK(hipFree(d_aggregates_output)); HIP_CHECK(hipFree(d_unique_count_output)); } #define CREATE_BENCHMARK(Key, Value, 
REDUCE_OP) \ benchmark::RegisterBenchmark(std::string("device_reduce_by_key" \ "." \ "(random_number_range:[1, " \ + std::to_string(max_length) + "])") \ .c_str(), \ &run_benchmark, \ max_length, \ stream, \ size, \ REDUCE_OP()) #define CREATE_BENCHMARKS(REDUCE_OP) \ CREATE_BENCHMARK(int, float, REDUCE_OP), CREATE_BENCHMARK(int, double, REDUCE_OP), \ CREATE_BENCHMARK(int, custom_double2, REDUCE_OP), \ CREATE_BENCHMARK(int8_t, int8_t, REDUCE_OP), \ CREATE_BENCHMARK(long long, float, REDUCE_OP), \ CREATE_BENCHMARK(long long, double, REDUCE_OP) void add_benchmarks(size_t max_length, std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_double2 = benchmark_utils::custom_type; std::vector bs = { CREATE_BENCHMARKS(hipcub::Sum), CREATE_BENCHMARK(long long, custom_double2, hipcub::Sum), CREATE_BENCHMARKS(hipcub::Min), #ifdef HIPCUB_ROCPRIM_API CREATE_BENCHMARK(long long, custom_double2, hipcub::Min), #endif }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_reduce_by_key" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks(1000, benchmarks, stream, size); add_benchmarks(10, benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_device_run_length_encode.cpp000066400000000000000000000356551502260333500252620ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
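// DeviceRunLengthEncode compresses runs of equal consecutive values. As a small illustration of
// the two variants benchmarked below (input values made up for the example): for the input
// {1,1,1,2,3,3}, Encode produces unique values {1,2,3} with counts {3,1,2} and a run count of 3,
// while NonTrivialRuns reports only the runs longer than one item, here offsets {0,4} with
// lengths {3,2}.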
// CUB's implementation of DeviceRunLengthEncode has unused parameters, // disable the warning because all warnings are threated as errors: #ifdef __HIP_PLATFORM_NVIDIA__ #pragma GCC diagnostic ignored "-Wunused-parameter" #endif #include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_run_length_encode.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size) { using key_type = T; using count_type = unsigned int; // Generate data std::vector input(size); unsigned int runs_count = 0; std::vector key_counts = benchmark_utils::get_random_data(100000, 1, max_length); size_t offset = 0; while(offset < size) { const size_t key_count = key_counts[runs_count % key_counts.size()]; const size_t end = std::min(size, offset + key_count); for(size_t i = offset; i < end; i++) { input[i] = runs_count; } runs_count++; offset += key_count; } key_type* d_input; HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); key_type* d_unique_output; count_type* d_counts_output; count_type* d_runs_count_output; HIP_CHECK(hipMalloc(&d_unique_output, runs_count * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(hipcub::DeviceRunLengthEncode::Encode(nullptr, temporary_storage_bytes, d_input, d_unique_output, d_counts_output, d_runs_count_output, size, stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK(hipcub::DeviceRunLengthEncode::Encode(d_temporary_storage, temporary_storage_bytes, d_input, d_unique_output, d_counts_output, d_runs_count_output, size, stream)); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(hipcub::DeviceRunLengthEncode::Encode(d_temporary_storage, temporary_storage_bytes, d_input, d_unique_output, d_counts_output, d_runs_count_output, size, stream)); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_unique_output)); HIP_CHECK(hipFree(d_counts_output)); HIP_CHECK(hipFree(d_runs_count_output)); } template void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size) { using key_type = T; using offset_type = unsigned int; using count_type = unsigned int; // Generate data std::vector input(size); unsigned int runs_count = 0; std::vector key_counts = benchmark_utils::get_random_data(100000, 1, max_length); size_t offset = 0; while(offset < size) { const size_t key_count = key_counts[runs_count % key_counts.size()]; const size_t end = std::min(size, offset + key_count); for(size_t i = offset; i < end; i++) { input[i] = runs_count; } runs_count++; offset += 
key_count; } key_type* d_input; HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); offset_type* d_offsets_output; count_type* d_counts_output; count_type* d_runs_count_output; HIP_CHECK(hipMalloc(&d_offsets_output, runs_count * sizeof(offset_type))); HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(hipcub::DeviceRunLengthEncode::NonTrivialRuns(nullptr, temporary_storage_bytes, d_input, d_offsets_output, d_counts_output, d_runs_count_output, size, stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK(hipcub::DeviceRunLengthEncode::NonTrivialRuns(d_temporary_storage, temporary_storage_bytes, d_input, d_offsets_output, d_counts_output, d_runs_count_output, size, stream)); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(hipcub::DeviceRunLengthEncode::NonTrivialRuns(d_temporary_storage, temporary_storage_bytes, d_input, d_offsets_output, d_counts_output, d_runs_count_output, size, stream)); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_offsets_output)); HIP_CHECK(hipFree(d_counts_output)); HIP_CHECK(hipFree(d_runs_count_output)); } #define CREATE_ENCODE_BENCHMARK(T) \ benchmark::RegisterBenchmark(std::string("device_run_length_encode" \ "." 
\ "(random_number_range:[1, " \ + std::to_string(max_length) + "])") \ .c_str(), \ &run_encode_benchmark, \ max_length, \ stream, \ size) void add_encode_benchmarks(size_t max_length, std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; std::vector bs = { CREATE_ENCODE_BENCHMARK(int), CREATE_ENCODE_BENCHMARK(long long), CREATE_ENCODE_BENCHMARK(int8_t), CREATE_ENCODE_BENCHMARK(uint8_t), CREATE_ENCODE_BENCHMARK(custom_float2), CREATE_ENCODE_BENCHMARK(custom_double2), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } #define CREATE_NON_TRIVIAL_RUNS_BENCHMARK(T) \ benchmark::RegisterBenchmark(std::string("run_length_encode_non_trivial_runs" \ "" \ "(random_number_range:[1, " \ + std::to_string(max_length) + "])") \ .c_str(), \ &run_non_trivial_runs_benchmark, \ max_length, \ stream, \ size) void add_non_trivial_runs_benchmarks(size_t max_length, std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; std::vector bs = { CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int), CREATE_NON_TRIVIAL_RUNS_BENCHMARK(long long), CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int8_t), CREATE_NON_TRIVIAL_RUNS_BENCHMARK(uint8_t), CREATE_NON_TRIVIAL_RUNS_BENCHMARK(custom_float2), CREATE_NON_TRIVIAL_RUNS_BENCHMARK(custom_double2), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_run_length_encode" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_encode_benchmarks(1000, benchmarks, stream, size); add_encode_benchmarks(10, benchmarks, stream, size); add_non_trivial_runs_benchmarks(1000, benchmarks, stream, size); add_non_trivial_runs_benchmarks(10, benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_device_scan.cpp000066400000000000000000000424461502260333500225200ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE // CUB's implementation of single_pass_scan_operators has maybe uninitialized // parameters, disable the warning because all warnings are threated as errors: #ifdef __HIP_PLATFORM_NVIDIA__ #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif #include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_scan.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template auto run_device_scan(void* temporary_storage, size_t& storage_size, T* input, T* output, const T initial_value, const size_t input_size, BinaryFunction scan_op, const hipStream_t stream) -> typename std::enable_if::type { return hipcub::DeviceScan::ExclusiveScan(temporary_storage, storage_size, input, output, scan_op, initial_value, input_size, stream); } template auto run_device_scan(void* temporary_storage, size_t& storage_size, T* input, T* output, const T initial_value, const size_t input_size, BinaryFunction scan_op, const hipStream_t stream) -> typename std::enable_if::type { (void)initial_value; return hipcub::DeviceScan::InclusiveScan(temporary_storage, storage_size, input, output, scan_op, input_size, stream); } template auto run_device_scan_by_key(void* temporary_storage, size_t& storage_size, K* keys, T* input, T* output, const T initial_value, const size_t input_size, BinaryFunction scan_op, const hipStream_t stream) -> typename std::enable_if::type { return hipcub::DeviceScan::ExclusiveScanByKey(temporary_storage, storage_size, keys, input, output, scan_op, initial_value, static_cast(input_size), hipcub::Equality(), stream); } template auto run_device_scan_by_key(void* temporary_storage, size_t& storage_size, K* keys, T* input, T* output, const T /*initial_value*/, const size_t input_size, BinaryFunction scan_op, const hipStream_t stream) -> typename std::enable_if::type { return hipcub::DeviceScan::InclusiveScanByKey(temporary_storage, storage_size, keys, input, output, scan_op, static_cast(input_size), hipcub::Equality(), stream); } template void run_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, BinaryFunction scan_op) { std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); T initial_value = T(123); T* d_input; T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes = 0; void* d_temp_storage = nullptr; // Get size of d_temp_storage HIP_CHECK((run_device_scan(d_temp_storage, temp_storage_size_bytes, d_input, d_output, initial_value, size, scan_op, stream))); HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 5; i++) { HIP_CHECK((run_device_scan(d_temp_storage, temp_storage_size_bytes, d_input, d_output, initial_value, size, scan_op, stream))); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; 
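// Each benchmark iteration launches batch_size scans back to back and reports their combined
// time through SetIterationTime, which feeds the manual timing enabled via UseManualTime() in
// main(); batching amortises per-launch and synchronisation overhead over several runs.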
for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK((run_device_scan(d_temp_storage, temp_storage_size_bytes, d_input, d_output, initial_value, size, scan_op, stream))); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_temp_storage)); } template void run_benchmark_by_key(benchmark::State& state, size_t size, const hipStream_t stream, BinaryFunction scan_op) { using key_type = int; constexpr size_t max_segment_length = 100; const std::vector keys = benchmark_utils::get_random_segments(size, max_segment_length, std::random_device{}()); const std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); const T initial_value = T(123); key_type* d_keys; T* d_input; T* d_output; HIP_CHECK(hipMalloc(&d_keys, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK(hipMemcpy(d_keys, keys.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes = 0; void* d_temp_storage = nullptr; // Get size of d_temp_storage HIP_CHECK((run_device_scan_by_key(d_temp_storage, temp_storage_size_bytes, d_keys, d_input, d_output, initial_value, size, scan_op, stream))); HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 5; i++) { HIP_CHECK((run_device_scan_by_key(d_temp_storage, temp_storage_size_bytes, d_keys, d_input, d_output, initial_value, size, scan_op, stream))); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK((run_device_scan_by_key(d_temp_storage, temp_storage_size_bytes, d_keys, d_input, d_output, initial_value, size, scan_op, stream))); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_keys)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_temp_storage)); } #define CREATE_BENCHMARK(EXCL, T, SCAN_OP) \ benchmark::RegisterBenchmark( \ std::string(std::string(EXCL ? "device_exclusive_scan" : "device_inclusive_scan") \ + ".") \ .c_str(), \ &run_benchmark, \ size, \ stream, \ SCAN_OP()), \ benchmark::RegisterBenchmark( \ std::string(std::string(EXCL ? 
"device_exclusive_scan_by_key" \ : "device_inclusive_scan_by_key") \ + ".") \ .c_str(), \ &run_benchmark_by_key, \ size, \ stream, \ SCAN_OP()) #define CREATE_BENCHMARKS(SCAN_OP) \ CREATE_BENCHMARK(false, int, SCAN_OP), CREATE_BENCHMARK(true, int, SCAN_OP), \ CREATE_BENCHMARK(false, float, SCAN_OP), CREATE_BENCHMARK(true, float, SCAN_OP), \ CREATE_BENCHMARK(false, double, SCAN_OP), CREATE_BENCHMARK(true, double, SCAN_OP), \ CREATE_BENCHMARK(false, long long, SCAN_OP), CREATE_BENCHMARK(true, long long, SCAN_OP), \ CREATE_BENCHMARK(false, custom_float2, SCAN_OP), \ CREATE_BENCHMARK(true, custom_float2, SCAN_OP), \ CREATE_BENCHMARK(false, custom_double2, SCAN_OP), \ CREATE_BENCHMARK(true, custom_double2, SCAN_OP), CREATE_BENCHMARK(false, int8_t, SCAN_OP), \ CREATE_BENCHMARK(true, int8_t, SCAN_OP), CREATE_BENCHMARK(false, uint8_t, SCAN_OP), \ CREATE_BENCHMARK(true, uint8_t, SCAN_OP) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_scan" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; using custom_double2 = benchmark_utils::custom_type; using custom_float2 = benchmark_utils::custom_type; // Compilation may never finish, if the compiler needs to compile too many // kernels, it is recommended to compile benchmarks only for 1-2 types when // BENCHMARK_CONFIG_TUNING is used (all other CREATE_*_BENCHMARK should be // commented/removed). // Add benchmarks std::vector benchmarks = { CREATE_BENCHMARKS(hipcub::Sum), CREATE_BENCHMARKS(hipcub::Min), }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_device_segmented_radix_sort.cpp000066400000000000000000000467771502260333500260200ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/hipcub.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif const unsigned int batch_size = 4; const unsigned int warmup_size = 2; constexpr bool Ascending = false; constexpr bool Descending = true; template void run_sort_keys_benchmark(benchmark::State& state, size_t desired_segments, hipStream_t stream, size_t size, bool descending = false) { using offset_type = int; using key_type = Key; typedef hipError_t (*sort_func)(void*, size_t&, const key_type*, key_type*, int, int, offset_type*, offset_type*, int, int, hipStream_t); sort_func func_ascending = &hipcub::DeviceSegmentedRadixSort::SortKeys; sort_func func_descending = &hipcub::DeviceSegmentedRadixSort::SortKeysDescending; sort_func sorting = descending ? func_descending : func_ascending; // Generate data std::vector offsets; const double avg_segment_length = static_cast(size) / desired_segments; const unsigned int seed = 123; std::default_random_engine gen(seed); std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); unsigned int segments_count = 0; size_t offset = 0; while(offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); offsets.push_back(offset); segments_count++; offset += segment_length; } offsets.push_back(size); std::vector keys_input = benchmark_utils::get_random_data( size, benchmark_utils::generate_limits::min(), benchmark_utils::generate_limits::max()); offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); HIP_CHECK(hipMemcpy(d_offsets, offsets.data(), (segments_count + 1) * sizeof(offset_type), hipMemcpyHostToDevice)); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream)); } HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); 
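// Throughput accounting: SetBytesProcessed counts only the key payload
// (batch_size * size * sizeof(key_type) per iteration); the segment offsets
// and temporary-storage traffic are not included in the reported rates.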
HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_offsets)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } template void run_sort_pairs_benchmark(benchmark::State& state, size_t desired_segments, hipStream_t stream, size_t size, bool descending = false) { using offset_type = int; using key_type = Key; using value_type = Value; typedef hipError_t (*sort_func)(void*, size_t&, const key_type*, key_type*, const value_type*, value_type*, int, int, offset_type*, offset_type*, int, int, hipStream_t); sort_func func_ascending = &hipcub::DeviceSegmentedRadixSort::SortPairs; sort_func func_descending = &hipcub::DeviceSegmentedRadixSort:: SortPairsDescending; sort_func sorting = descending ? func_descending : func_ascending; // Generate data std::vector offsets; const double avg_segment_length = static_cast(size) / desired_segments; const unsigned int seed = 123; std::default_random_engine gen(seed); std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); unsigned int segments_count = 0; size_t offset = 0; while(offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); offsets.push_back(offset); segments_count++; offset += segment_length; } offsets.push_back(size); std::vector keys_input = benchmark_utils::get_random_data( size, benchmark_utils::generate_limits::min(), benchmark_utils::generate_limits::max()); std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); HIP_CHECK(hipMemcpy(d_offsets, offsets.data(), (segments_count + 1) * sizeof(offset_type), hipMemcpyHostToDevice)); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; value_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream)); } HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * 
batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_offsets)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } #define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ benchmark::RegisterBenchmark( \ std::string("device_segmented_radix_sort_keys" \ "." \ "(segments:~" \ + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ [=](benchmark::State& state) \ { run_sort_keys_benchmark(state, SEGMENTS, stream, size, Ascending); }) #define CREATE_SORT_KEYS_DESCENDING_BENCHMARK(Key, SEGMENTS) \ benchmark::RegisterBenchmark( \ std::string("device_segmented_radix_sort_keys" \ "." \ "(segments:~" \ + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ [=](benchmark::State& state) \ { run_sort_keys_benchmark(state, SEGMENTS, stream, size, Descending); }) #define BENCHMARK_KEY_TYPE(type) \ CREATE_SORT_KEYS_BENCHMARK(type, 1), CREATE_SORT_KEYS_BENCHMARK(type, 10), \ CREATE_SORT_KEYS_BENCHMARK(type, 100), CREATE_SORT_KEYS_BENCHMARK(type, 1000), \ CREATE_SORT_KEYS_BENCHMARK(type, 10000), CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1), \ CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10), \ CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 100), \ CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1000), \ CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10000) void add_sort_keys_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { BENCHMARK_KEY_TYPE(float), BENCHMARK_KEY_TYPE(double), BENCHMARK_KEY_TYPE(int8_t), BENCHMARK_KEY_TYPE(uint8_t), BENCHMARK_KEY_TYPE(int), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } #define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ benchmark::RegisterBenchmark( \ std::string("device_segmented_radix_sort_pairs" \ "." \ "(segments:~" \ + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ [=](benchmark::State& state) \ { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Ascending); }) #define CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(Key, Value, SEGMENTS) \ benchmark::RegisterBenchmark( \ std::string("device_segmented_radix_sort_pairs" \ "." 
\ "(segments:~" \ + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ [=](benchmark::State& state) \ { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Descending); }) #define BENCHMARK_PAIR_TYPE(type, value) \ CREATE_SORT_PAIRS_BENCHMARK(type, value, 1), CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), \ CREATE_SORT_PAIRS_BENCHMARK(type, value, 100), \ CREATE_SORT_PAIRS_BENCHMARK(type, value, 1000), \ CREATE_SORT_PAIRS_BENCHMARK(type, value, 10000), \ CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1), \ CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10), \ CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 100), \ CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1000), \ CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10000) void add_sort_pairs_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; std::vector bs = { BENCHMARK_PAIR_TYPE(int, float), BENCHMARK_PAIR_TYPE(long long, double), BENCHMARK_PAIR_TYPE(int8_t, int8_t), BENCHMARK_PAIR_TYPE(uint8_t, uint8_t), BENCHMARK_PAIR_TYPE(int, custom_float2), BENCHMARK_PAIR_TYPE(long long, custom_double2), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_segmented_radix_sort" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_sort_keys_benchmarks(benchmarks, stream, size); add_sort_pairs_benchmarks(benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_device_segmented_reduce.cpp000066400000000000000000000250141502260333500250660ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_segmented_reduce.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif const unsigned int batch_size = 10; const unsigned int warmup_size = 5; using OffsetType = int; template void run_benchmark(benchmark::State& state, size_t desired_segments, hipStream_t stream, size_t size, SegmentedReduceKernel segmented_reduce) { using value_type = T; // Generate data const unsigned int seed = 123; std::default_random_engine gen(seed); const double avg_segment_length = static_cast(size) / desired_segments; std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); std::vector offsets; unsigned int segments_count = 0; size_t offset = 0; while(offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); offsets.push_back(offset); segments_count++; offset += segment_length; } offsets.push_back(size); std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); OffsetType* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(OffsetType))); HIP_CHECK(hipMemcpy(d_offsets, offsets.data(), (segments_count + 1) * sizeof(OffsetType), hipMemcpyHostToDevice)); value_type* d_values_input; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); OutputT* d_aggregates_output; HIP_CHECK(hipMalloc(&d_aggregates_output, segments_count * sizeof(OutputT))); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(segmented_reduce(d_temporary_storage, temporary_storage_bytes, d_values_input, d_aggregates_output, segments_count, d_offsets, d_offsets + 1, stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(segmented_reduce(d_temporary_storage, temporary_storage_bytes, d_values_input, d_aggregates_output, segments_count, d_offsets, d_offsets + 1, stream)); } HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(segmented_reduce(d_temporary_storage, temporary_storage_bytes, d_values_input, d_aggregates_output, segments_count, d_offsets, d_offsets + 1, stream)); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(value_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_offsets)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_aggregates_output)); } template struct Benchmark; template struct Benchmark { static void run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) { hipError_t (*ptr_to_sum)(void*, size_t&, T*, T*, int, OffsetType*, OffsetType*, hipStream_t) = &hipcub::DeviceSegmentedReduce::Sum; run_benchmark(state, desired_segments, stream, 
size, ptr_to_sum); } }; template struct Benchmark { static void run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) { hipError_t (*ptr_to_min)(void*, size_t&, T*, T*, int, OffsetType*, OffsetType*, hipStream_t) = &hipcub::DeviceSegmentedReduce::Min; run_benchmark(state, desired_segments, stream, size, ptr_to_min); } }; template struct Benchmark { using Difference = OffsetType; using Iterator = typename hipcub::ArgIndexInputIterator; using KeyValue = typename Iterator::value_type; static void run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) { hipError_t (*ptr_to_argmin)(void*, size_t&, T*, KeyValue*, int, OffsetType*, OffsetType*, hipStream_t) = &hipcub::DeviceSegmentedReduce::ArgMin; run_benchmark(state, desired_segments, stream, size, ptr_to_argmin); } }; #define CREATE_BENCHMARK(T, SEGMENTS, REDUCE_OP) \ benchmark::RegisterBenchmark(std::string("device_segmented_reduce" \ "." \ "(number_of_segments:~" \ + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ &Benchmark::run, \ SEGMENTS, \ stream, \ size) #define BENCHMARK_TYPE(type, REDUCE_OP) \ CREATE_BENCHMARK(type, 1, REDUCE_OP), CREATE_BENCHMARK(type, 100, REDUCE_OP), \ CREATE_BENCHMARK(type, 10000, REDUCE_OP) #define CREATE_BENCHMARKS(REDUCE_OP) \ BENCHMARK_TYPE(float, REDUCE_OP), BENCHMARK_TYPE(double, REDUCE_OP), \ BENCHMARK_TYPE(int8_t, REDUCE_OP), BENCHMARK_TYPE(int, REDUCE_OP) void add_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_double2 = benchmark_utils::custom_type; std::vector bs = { CREATE_BENCHMARKS(hipcub::Sum), BENCHMARK_TYPE(custom_double2, hipcub::Sum), CREATE_BENCHMARKS(hipcub::Min), #ifdef HIPCUB_ROCPRIM_API BENCHMARK_TYPE(custom_double2, hipcub::Min), #endif CREATE_BENCHMARKS(hipcub::ArgMin), #ifdef HIPCUB_ROCPRIM_API BENCHMARK_TYPE(custom_double2, hipcub::ArgMin), #endif }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_segmented_reduce" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks(benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_device_segmented_sort.cpp000066400000000000000000000556271502260333500246230ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/hipcub.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif const unsigned int batch_size = 4; const unsigned int warmup_size = 2; template void run_sort_keys_benchmark(benchmark::State& state, size_t desired_segments, hipStream_t stream, size_t size, bool Descending = false, bool Stable = false) { using offset_type = int; using key_type = Key; typedef hipError_t (*sort_func)(void*, size_t&, const key_type*, key_type*, int, int, offset_type*, offset_type*, hipStream_t); sort_func func_ascending = &hipcub::DeviceSegmentedSort::SortKeys; sort_func func_descending = &hipcub::DeviceSegmentedSort::SortKeysDescending; sort_func func_ascending_stable = &hipcub::DeviceSegmentedSort::StableSortKeys; sort_func func_descending_stable = &hipcub::DeviceSegmentedSort::StableSortKeysDescending; sort_func sorting = Descending ? (Stable ? func_descending_stable : func_descending) : (Stable ? 
func_ascending_stable : func_ascending); std::vector offsets; const double avg_segment_length = static_cast(size) / desired_segments; std::random_device rd; std::default_random_engine gen(rd()); std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); unsigned int segments_count = 0; size_t offset = 0; while(offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); offsets.push_back(offset); ++segments_count; offset += segment_length; } offsets.push_back(size); std::vector keys_input = benchmark_utils::get_random_data( size, benchmark_utils::generate_limits::min(), benchmark_utils::generate_limits::max()); offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); HIP_CHECK(hipMemcpy(d_offsets, offsets.data(), (segments_count + 1) * sizeof(offset_type), hipMemcpyHostToDevice)); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; ++i) { HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, stream)); } HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; ++i) { HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_offsets)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } template void run_sort_pairs_benchmark(benchmark::State& state, size_t desired_segments, hipStream_t stream, size_t size, bool Descending = false, bool Stable = false) { using offset_type = int; using key_type = Key; using value_type = Value; typedef hipError_t (*sort_func)(void*, size_t&, const key_type*, key_type*, const value_type*, value_type*, int, int, offset_type*, offset_type*, hipStream_t); sort_func func_ascending = &hipcub::DeviceSegmentedSort::SortPairs; sort_func func_descending = &hipcub::DeviceSegmentedSort::SortPairsDescending; sort_func func_ascending_stable = &hipcub::DeviceSegmentedSort::StableSortPairs; sort_func func_descending_stable = &hipcub::DeviceSegmentedSort:: StableSortPairsDescending; sort_func sorting = Descending ? (Stable ? func_descending_stable : func_descending) : (Stable ? 
func_ascending_stable : func_ascending); std::vector offsets; const double avg_segment_length = static_cast(size) / desired_segments; std::random_device rd; std::default_random_engine gen(rd()); std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); unsigned int segments_count = 0; size_t offset = 0; while(offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); offsets.push_back(offset); ++segments_count; offset += segment_length; } offsets.push_back(size); std::vector keys_input = benchmark_utils::get_random_data( size, benchmark_utils::generate_limits::min(), benchmark_utils::generate_limits::max()); std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); HIP_CHECK(hipMemcpy(d_offsets, offsets.data(), (segments_count + 1) * sizeof(offset_type), hipMemcpyHostToDevice)); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; value_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, stream)); } HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_offsets)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } #define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ benchmark::RegisterBenchmark(std::string("device_segmented_sort_keys" \ "." \ "(number_of_segments:~" \ + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ [=](benchmark::State& state) { \ run_sort_keys_benchmark(state, SEGMENTS, stream, size); \ }), \ benchmark::RegisterBenchmark( \ std::string("device_segmented_sort_keys" \ "." 
\ "(number_of_segments:~" \ + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ [=](benchmark::State& state) \ { run_sort_keys_benchmark(state, SEGMENTS, stream, size, true); }), \ benchmark::RegisterBenchmark( \ std::string("device_segmented_sort_keys" \ "." \ "(number_of_segments:~" \ + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ [=](benchmark::State& state) \ { run_sort_keys_benchmark(state, SEGMENTS, stream, size, false, true); }), \ benchmark::RegisterBenchmark( \ std::string("device_segmented_sort_keys" \ "." \ "(number_of_segments:~" \ + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ [=](benchmark::State& state) \ { run_sort_keys_benchmark(state, SEGMENTS, stream, size, true, true); }) #define BENCHMARK_KEY_TYPE(type) \ CREATE_SORT_KEYS_BENCHMARK(type, 10), CREATE_SORT_KEYS_BENCHMARK(type, 100), \ CREATE_SORT_KEYS_BENCHMARK(type, 1000), CREATE_SORT_KEYS_BENCHMARK(type, 10000) void add_sort_keys_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { BENCHMARK_KEY_TYPE(float), BENCHMARK_KEY_TYPE(double), BENCHMARK_KEY_TYPE(int8_t), BENCHMARK_KEY_TYPE(uint8_t), BENCHMARK_KEY_TYPE(int), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } #define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ benchmark::RegisterBenchmark( \ std::string("device_segmented_sort_pairs" \ "." \ "(number_of_segments:~" \ + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ [=](benchmark::State& state) \ { run_sort_pairs_benchmark(state, SEGMENTS, stream, size); }), \ benchmark::RegisterBenchmark( \ std::string("device_segmented_sort_pairs" \ "." \ "(number_of_segments:~" \ + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ [=](benchmark::State& state) \ { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true); }), \ benchmark::RegisterBenchmark( \ std::string("device_segmented_sort_pairs" \ "." \ "(number_of_segments:~" \ + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ [=](benchmark::State& state) { \ run_sort_pairs_benchmark(state, SEGMENTS, stream, size, false, true); \ }), \ benchmark::RegisterBenchmark( \ std::string("device_segmented_sort_pairs" \ "." 
\ "(number_of_segments:~" \ + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ [=](benchmark::State& state) \ { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true, true); }) #define BENCHMARK_PAIR_TYPE(type, value) \ CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), CREATE_SORT_PAIRS_BENCHMARK(type, value, 100), \ CREATE_SORT_PAIRS_BENCHMARK(type, value, 10000) void add_sort_pairs_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; std::vector bs = { BENCHMARK_PAIR_TYPE(int, float), BENCHMARK_PAIR_TYPE(long long, double), BENCHMARK_PAIR_TYPE(int8_t, int8_t), BENCHMARK_PAIR_TYPE(uint8_t, uint8_t), BENCHMARK_PAIR_TYPE(int, custom_float2), BENCHMARK_PAIR_TYPE(long long, custom_double2), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_segmented_sort" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_sort_keys_benchmarks(benchmarks, stream, size); add_sort_pairs_benchmarks(benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_device_select.cpp000066400000000000000000000731621502260333500230520ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_select.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template void run_flagged_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, float true_probability) { std::vector input = benchmark_utils::get_random_data(size, benchmark_utils::generate_limits::min(), benchmark_utils::generate_limits::max()); std::vector flags = benchmark_utils::get_random_data01(size, true_probability); T* d_input; FlagType* d_flags; T* d_output; unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(FlagType))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK( hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(FlagType), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes = 0; // Get size of d_temp_storage HIP_CHECK(hipcub::DeviceSelect::Flagged(nullptr, temp_storage_size_bytes, d_input, d_flags, d_output, d_selected_count_output, input.size(), stream)); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK(hipcub::DeviceSelect::Flagged(d_temp_storage, temp_storage_size_bytes, d_input, d_flags, d_output, d_selected_count_output, input.size(), stream)); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(hipcub::DeviceSelect::Flagged(d_temp_storage, temp_storage_size_bytes, d_input, d_flags, d_output, d_selected_count_output, input.size(), stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_flags)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_selected_count_output)); HIP_CHECK(hipFree(d_temp_storage)); HIP_CHECK(hipDeviceSynchronize()); } template struct SelectOperator { float true_probability; SelectOperator(float true_probability_) : true_probability(true_probability_) {} HIPCUB_DEVICE inline constexpr bool operator()(const T& value) { return value < T(1000 * true_probability); } }; template void run_selectop_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, float true_probability) { std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); SelectOperator select_op(true_probability); T* d_input; T* d_output; unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t 
temp_storage_size_bytes; // Get size of d_temp_storage HIP_CHECK(hipcub::DeviceSelect::If(nullptr, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), select_op, stream)); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK(hipcub::DeviceSelect::If(d_temp_storage, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), select_op, stream)); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(hipcub::DeviceSelect::If(d_temp_storage, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), select_op, stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_selected_count_output)); HIP_CHECK(hipFree(d_temp_storage)); HIP_CHECK(hipDeviceSynchronize()); } template void run_flagged_if_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, float true_probability) { std::vector input = benchmark_utils::get_random_data(size, benchmark_utils::generate_limits::min(), benchmark_utils::generate_limits::max()); std::vector flags = benchmark_utils::get_random_data01(size, true_probability); SelectOperator select_flag_op(true_probability); T* d_input; FlagType* d_flags; T* d_output; unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(FlagType))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK( hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(FlagType), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes = 0; // Get size of d_temp_storage HIP_CHECK(hipcub::DeviceSelect::FlaggedIf(nullptr, temp_storage_size_bytes, d_input, d_flags, d_output, d_selected_count_output, input.size(), select_flag_op, stream)); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK(hipcub::DeviceSelect::FlaggedIf(d_temp_storage, temp_storage_size_bytes, d_input, d_flags, d_output, d_selected_count_output, input.size(), select_flag_op, stream)); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(hipcub::DeviceSelect::FlaggedIf(d_temp_storage, temp_storage_size_bytes, d_input, d_flags, d_output, d_selected_count_output, input.size(), select_flag_op, stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = 
std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); hipFree(d_input); hipFree(d_flags); hipFree(d_output); hipFree(d_selected_count_output); hipFree(d_temp_storage); HIP_CHECK(hipDeviceSynchronize()); } template void run_unique_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, float discontinuity_probability) { hipcub::Sum op; std::vector input(size); { auto input01 = benchmark_utils::get_random_data01(size, discontinuity_probability); auto acc = input01[0]; input[0] = acc; for(size_t i = 1; i < input01.size(); i++) { input[i] = op(acc, input01[i]); } } T* d_input; T* d_output; unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage HIP_CHECK(hipcub::DeviceSelect::Unique(nullptr, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), stream)); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK(hipcub::DeviceSelect::Unique(d_temp_storage, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), stream)); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(hipcub::DeviceSelect::Unique(d_temp_storage, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_selected_count_output)); HIP_CHECK(hipFree(d_temp_storage)); } template void run_unique_by_key_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, float discontinuity_probability) { hipcub::Sum op; std::vector input_keys(size); { auto input01 = benchmark_utils::get_random_data01(size, discontinuity_probability); auto acc = input01[0]; input_keys[0] = acc; for(size_t i = 1; i < input01.size(); i++) { input_keys[i] = op(acc, input01[i]); } } const auto input_values = benchmark_utils::get_random_data(size, ValueT(-1000), ValueT(1000)); KeyT* d_keys_input; ValueT* d_values_input; KeyT* d_keys_output; ValueT* d_values_output; unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_keys_input, input_keys.size() * sizeof(input_keys[0]))); HIP_CHECK(hipMalloc(&d_values_input, input_values.size() * sizeof(input_values[0]))); HIP_CHECK(hipMalloc(&d_keys_output, input_keys.size() * sizeof(input_keys[0]))); 
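// Keys for the unique-by-key run are built above as a running sum of 0/1
// samples drawn with the given discontinuity probability, so the expected run
// length is roughly 1 / discontinuity_probability; the paired values are
// random data in [-1000, 1000].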
HIP_CHECK(hipMalloc(&d_values_output, input_values.size() * sizeof(input_values[0]))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(*d_selected_count_output))); HIP_CHECK(hipMemcpy(d_keys_input, input_keys.data(), input_keys.size() * sizeof(input_keys[0]), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(d_values_input, input_values.data(), input_values.size() * sizeof(input_values[0]), hipMemcpyHostToDevice)); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage HIP_CHECK(hipcub::DeviceSelect::UniqueByKey(nullptr, temp_storage_size_bytes, d_keys_input, d_values_input, d_keys_output, d_values_output, d_selected_count_output, input_keys.size(), stream)); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK(hipcub::DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_size_bytes, d_keys_input, d_values_input, d_keys_output, d_values_output, d_selected_count_output, input_keys.size(), stream)); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(hipcub::DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_size_bytes, d_keys_input, d_values_input, d_keys_output, d_values_output, d_selected_count_output, input_keys.size(), stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(KeyT) + sizeof(ValueT))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_output)); HIP_CHECK(hipFree(d_selected_count_output)); HIP_CHECK(hipFree(d_temp_storage)); } #define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p) \ benchmark::RegisterBenchmark( \ std::string("device_select_flagged.(probability:" #p ")") \ .c_str(), \ &run_flagged_benchmark, \ size, \ stream, \ p) #define CREATE_SELECT_IF_BENCHMARK(T, p) \ benchmark::RegisterBenchmark( \ std::string("device_select_if.(probability:" #p ")") \ .c_str(), \ &run_selectop_benchmark, \ size, \ stream, \ p) #define CREATE_SELECT_FLAGGED_IF_BENCHMARK(T, F, p) \ benchmark::RegisterBenchmark( \ std::string("device_select_flagged_if.(probability:" #p ")") \ .c_str(), \ &run_flagged_if_benchmark, \ size, \ stream, \ p) #define CREATE_UNIQUE_BENCHMARK(T, p) \ benchmark::RegisterBenchmark( \ std::string("device_select_unique.(probability:" #p ")") \ .c_str(), \ &run_unique_benchmark, \ size, \ stream, \ p) #define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \ benchmark::RegisterBenchmark( \ std::string("device_select_unique_by_key.(probability:" #p ")") \ .c_str(), \ &run_unique_by_key_benchmark, \ size, \ stream, \ p) #define BENCHMARK_FLAGGED_TYPE(type, value) \ CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.05f), \ CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.25f), \ CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.5f), \ CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.75f) #define BENCHMARK_IF_TYPE(type) \ CREATE_SELECT_IF_BENCHMARK(type, 0.05f), CREATE_SELECT_IF_BENCHMARK(type, 0.25f), \ 
CREATE_SELECT_IF_BENCHMARK(type, 0.5f), CREATE_SELECT_IF_BENCHMARK(type, 0.75f) #define BENCHMARK_FLAGGED_IF_TYPE(type, value) \ CREATE_SELECT_FLAGGED_IF_BENCHMARK(type, value, 0.05f), \ CREATE_SELECT_FLAGGED_IF_BENCHMARK(type, value, 0.25f), \ CREATE_SELECT_FLAGGED_IF_BENCHMARK(type, value, 0.5f), \ CREATE_SELECT_FLAGGED_IF_BENCHMARK(type, value, 0.75f) #define BENCHMARK_UNIQUE_TYPE(type) \ CREATE_UNIQUE_BENCHMARK(type, 0.05f), CREATE_UNIQUE_BENCHMARK(type, 0.25f), \ CREATE_UNIQUE_BENCHMARK(type, 0.5f), CREATE_UNIQUE_BENCHMARK(type, 0.75f) #define BENCHMARK_UNIQUE_BY_KEY_TYPE(key_type, value_type) \ CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.05f), \ CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.25f), \ CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.5f), \ CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.75f) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_select" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; using custom_double2 = benchmark_utils::custom_type; using custom_int_double = benchmark_utils::custom_type; // Add benchmarks std::vector benchmarks = {BENCHMARK_FLAGGED_TYPE(int, unsigned char), BENCHMARK_FLAGGED_TYPE(float, unsigned char), BENCHMARK_FLAGGED_TYPE(double, unsigned char), BENCHMARK_FLAGGED_TYPE(uint8_t, uint8_t), BENCHMARK_FLAGGED_TYPE(int8_t, int8_t), BENCHMARK_FLAGGED_TYPE(custom_double2, unsigned char), BENCHMARK_IF_TYPE(int), BENCHMARK_IF_TYPE(float), BENCHMARK_IF_TYPE(double), BENCHMARK_IF_TYPE(uint8_t), BENCHMARK_IF_TYPE(int8_t), BENCHMARK_IF_TYPE(custom_int_double), BENCHMARK_FLAGGED_IF_TYPE(int, unsigned char), BENCHMARK_FLAGGED_IF_TYPE(float, unsigned char), BENCHMARK_FLAGGED_IF_TYPE(double, unsigned char), BENCHMARK_FLAGGED_IF_TYPE(uint8_t, uint8_t), BENCHMARK_FLAGGED_IF_TYPE(int8_t, int8_t), BENCHMARK_FLAGGED_IF_TYPE(custom_double2, unsigned char), BENCHMARK_UNIQUE_TYPE(int), BENCHMARK_UNIQUE_TYPE(float), BENCHMARK_UNIQUE_TYPE(double), BENCHMARK_UNIQUE_TYPE(uint8_t), BENCHMARK_UNIQUE_TYPE(int8_t), BENCHMARK_UNIQUE_TYPE(custom_int_double), BENCHMARK_UNIQUE_BY_KEY_TYPE(int, int), BENCHMARK_UNIQUE_BY_KEY_TYPE(float, double), BENCHMARK_UNIQUE_BY_KEY_TYPE(double, custom_double2), BENCHMARK_UNIQUE_BY_KEY_TYPE(uint8_t, uint8_t), BENCHMARK_UNIQUE_BY_KEY_TYPE(int8_t, double), BENCHMARK_UNIQUE_BY_KEY_TYPE(custom_int_double, custom_int_double)}; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_device_spmv.cpp000066400000000000000000000245251502260333500225570ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_spmv.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 32; #endif const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template void run_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, float probability) { const T rand_min = T(1); const T rand_max = T(10); // generate a lexicograhically sorted list of (row, column) index tuples // number of nonzeroes cannot be guaranteed as duplicates may exist const int num_nonzeroes_attempt = static_cast( std::min(static_cast(INT_MAX), static_cast(probability * static_cast(size * size)))); std::vector> indices(num_nonzeroes_attempt); { std::vector flat_indices = benchmark_utils::get_random_data(2 * num_nonzeroes_attempt, 0, size - 1, 2 * num_nonzeroes_attempt); for(int i = 0; i < num_nonzeroes_attempt; i++) { indices[i] = std::make_pair(flat_indices[2 * i], flat_indices[2 * i + 1]); } std::sort(indices.begin(), indices.end()); } // generate the compressed sparse rows matrix std::pair prev_cell = std::make_pair(-1, -1); int num_nonzeroes = 0; std::vector row_offsets(size + 1); // this vector might be too large, but doing the allocation now eliminates a // scan std::vector column_indices(num_nonzeroes_attempt); row_offsets[0] = 0; int last_row_written = 0; for(int i = 0; i < num_nonzeroes_attempt; i++) { if(indices[i] != prev_cell) { // update the row offets if we go to the next row (or skip some) if(indices[i].first != last_row_written) { for(int j = last_row_written + 1; j <= indices[i].first; j++) { row_offsets[j] = num_nonzeroes; } last_row_written = indices[i].first; } column_indices[num_nonzeroes++] = indices[i].second; prev_cell = indices[i]; } } // fill in the entries for any missing rows for(int j = last_row_written + 1; j < static_cast(size) + 1; j++) { row_offsets[j] = num_nonzeroes; } // generate the random data once the actual number of nonzeroes are known std::vector values = benchmark_utils::get_random_data(num_nonzeroes, rand_min, rand_max); std::vector vector_x = benchmark_utils::get_random_data(size, rand_min, rand_max); T* d_values; int* d_row_offsets; int* d_column_indices; T* d_vector_x; T* d_vector_y; HIP_CHECK(hipMalloc(&d_values, values.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_row_offsets, row_offsets.size() * sizeof(int))); HIP_CHECK(hipMalloc(&d_column_indices, num_nonzeroes * sizeof(int))); HIP_CHECK(hipMalloc(&d_vector_x, vector_x.size() * 
sizeof(T))); HIP_CHECK(hipMalloc(&d_vector_y, size * sizeof(T))); HIP_CHECK(hipMemcpy(d_values, values.data(), values.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(d_row_offsets, row_offsets.data(), row_offsets.size() * sizeof(int), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(d_column_indices, column_indices.data(), num_nonzeroes * sizeof(int), hipMemcpyHostToDevice)); HIP_CHECK( hipMemcpy(d_vector_x, vector_x.data(), vector_x.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage HIP_CHECK(hipcub::DeviceSpmv::CsrMV(nullptr, temp_storage_size_bytes, d_values, d_row_offsets, d_column_indices, d_vector_x, d_vector_y, size, size, num_nonzeroes, stream)); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(hipcub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_size_bytes, d_values, d_row_offsets, d_column_indices, d_vector_x, d_vector_y, size, size, num_nonzeroes, stream)); } HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(hipcub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_size_bytes, d_values, d_row_offsets, d_column_indices, d_vector_x, d_vector_y, size, size, num_nonzeroes, stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * (num_nonzeroes + size) * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * (num_nonzeroes + size)); HIP_CHECK(hipFree(d_temp_storage)); HIP_CHECK(hipFree(d_vector_y)); HIP_CHECK(hipFree(d_vector_x)); HIP_CHECK(hipFree(d_column_indices)); HIP_CHECK(hipFree(d_row_offsets)); HIP_CHECK(hipFree(d_values)); HIP_CHECK(hipDeviceSynchronize()); } #define CREATE_BENCHMARK(T, p) \ benchmark::RegisterBenchmark( \ std::string("device_spmv_CsrMV.").c_str(), \ &run_benchmark, \ size, \ stream, \ p) #define BENCHMARK_TYPE(type) \ CREATE_BENCHMARK(type, 1.0e-6f), CREATE_BENCHMARK(type, 1.0e-5f), \ CREATE_BENCHMARK(type, 1.0e-4f), CREATE_BENCHMARK(type, 1.0e-3f), \ CREATE_BENCHMARK(type, 1.0e-2f) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "benchmark_device_spmv" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks = { BENCHMARK_TYPE(int), BENCHMARK_TYPE(unsigned int), BENCHMARK_TYPE(float), BENCHMARK_TYPE(double), }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); 
} } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_utils.hpp000066400000000000000000000413721502260333500214170ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef HIPCUB_BENCHMARK_UTILS_HPP_ #define HIPCUB_BENCHMARK_UTILS_HPP_ #ifndef BENCHMARK_UTILS_INCLUDE_GUARD #error benchmark_utils.hpp must ONLY be included by common_benchmark_header.hpp. Please include common_benchmark_header.hpp instead. #endif // hipCUB API #ifdef __HIP_PLATFORM_AMD__ #include "hipcub/backend/rocprim/util_ptx.hpp" #elif defined(__HIP_PLATFORM_NVIDIA__) #include "hipcub/config.hpp" #include #endif #include "hipcub/tuple.hpp" #ifndef HIPCUB_CUB_API #define HIPCUB_WARP_THREADS_MACRO warpSize #else #define HIPCUB_WARP_THREADS_MACRO CUB_PTX_WARP_THREADS #endif namespace benchmark_utils { const size_t default_max_random_size = 1024 * 1024; // get_random_data() generates only part of sequence and replicates it, // because benchmarks usually do not need "true" random sequence. 
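// Illustrative sketch of the replication scheme described above (added for
// clarity, not part of the original header): only the first
// min(size, max_random_size) elements are drawn from the generator, and that
// prefix is then copied until the buffer is full, e.g.
//
//   std::vector<int> data(size);
//   std::generate(data.begin(),
//                 data.begin() + std::min(size, max_random_size),
//                 [&]() { return distribution(gen); });
//   for(size_t i = max_random_size; i < size; i += max_random_size)
//   {
//       std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i);
//   }
//
// This trades true randomness for a much cheaper host-side setup, which is
// acceptable for throughput benchmarks.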
template inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = default_max_random_size) -> typename std::enable_if::value, std::vector>::type { std::random_device rd; std::default_random_engine gen(rd()); using distribution_type = typename std::conditional<(sizeof(T) == 1), short, T>::type; std::uniform_int_distribution distribution(min, max); std::vector data(size); std::generate(data.begin(), data.begin() + std::min(size, max_random_size), [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); } return data; } template inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = default_max_random_size) -> typename std::enable_if::value, std::vector>::type { std::random_device rd; std::default_random_engine gen(rd()); std::uniform_real_distribution distribution(min, max); std::vector data(size); std::generate(data.begin(), data.begin() + std::min(size, max_random_size), [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); } return data; } template inline std::vector get_random_data01(size_t size, float p, size_t max_random_size = default_max_random_size) { std::random_device rd; std::default_random_engine gen(rd()); std::bernoulli_distribution distribution(p); std::vector data(size); std::generate(data.begin(), data.begin() + std::min(size, max_random_size), [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); } return data; } template inline T get_random_value(T min, T max) { return get_random_data(1, min, max)[0]; } // Can't use std::prefix_sum for inclusive/exclusive scan, because // it does not handle short[] -> int(int a, int b) { a + b; } -> int[] // they way we expect. 
That's because sum in std::prefix_sum's implementation // is of type typename std::iterator_traits::value_type (short) template OutputIt host_inclusive_scan(InputIt first, InputIt last, OutputIt d_first, BinaryOperation op) { using input_type = typename std::iterator_traits::value_type; using output_type = typename std::iterator_traits::value_type; using result_type = typename std::conditional::value, input_type, output_type>::type; if(first == last) return d_first; result_type sum = *first; *d_first = sum; while(++first != last) { sum = op(sum, static_cast(*first)); *++d_first = sum; } return ++d_first; } template OutputIt host_exclusive_scan( InputIt first, InputIt last, T initial_value, OutputIt d_first, BinaryOperation op) { using input_type = typename std::iterator_traits::value_type; using output_type = typename std::iterator_traits::value_type; using result_type = typename std::conditional::value, input_type, output_type>::type; if(first == last) return d_first; result_type sum = initial_value; *d_first = initial_value; while((first + 1) != last) { sum = op(sum, static_cast(*first)); *++d_first = sum; first++; } return ++d_first; } template OutputIt host_exclusive_scan_by_key(InputIt first, InputIt last, KeyIt k_first, T initial_value, OutputIt d_first, BinaryOperation op, KeyCompare key_compare_op) { using input_type = typename std::iterator_traits::value_type; using output_type = typename std::iterator_traits::value_type; using result_type = typename std::conditional::value, input_type, output_type>::type; if(first == last) return d_first; result_type sum = initial_value; *d_first = initial_value; while((first + 1) != last) { if(key_compare_op(*k_first, *++k_first)) { sum = op(sum, static_cast(*first)); } else { sum = initial_value; } *++d_first = sum; first++; } return ++d_first; } template struct custom_type { using first_type = T; using second_type = U; T x; U y; HIPCUB_HOST_DEVICE inline constexpr custom_type() : x(T()), y(U()) {} HIPCUB_HOST_DEVICE inline constexpr custom_type(T xx, U yy) : x(xx), y(yy) {} HIPCUB_HOST_DEVICE inline constexpr custom_type(T xy) : x(xy), y(xy) {} template HIPCUB_HOST_DEVICE inline custom_type(const custom_type& other) : x(other.x), y(other.y) {} #ifndef HIPCUB_CUB_API HIPCUB_HOST_DEVICE inline ~custom_type() = default; #endif HIPCUB_HOST_DEVICE inline custom_type& operator=(const custom_type& other) { x = other.x; y = other.y; return *this; } HIPCUB_HOST_DEVICE inline custom_type operator+(const custom_type& rhs) const { return custom_type(x + rhs.x, y + rhs.y); } HIPCUB_HOST_DEVICE inline custom_type operator-(const custom_type& other) const { return custom_type(x - other.x, y - other.y); } HIPCUB_HOST_DEVICE inline bool operator<(const custom_type& rhs) const { // intentionally suboptimal choice for short-circuting, // required to generate more performant device code return ((x == rhs.x && y < rhs.y) || x < rhs.x); } HIPCUB_HOST_DEVICE inline bool operator>(const custom_type& other) const { return (x > other.x || (x == other.x && y > other.y)); } HIPCUB_HOST_DEVICE inline bool operator==(const custom_type& rhs) const { return x == rhs.x && y == rhs.y; } HIPCUB_HOST_DEVICE inline bool operator!=(const custom_type& other) const { return !(*this == other); } HIPCUB_HOST_DEVICE custom_type& operator+=(const custom_type& rhs) { this->x += rhs.x; this->y += rhs.y; return *this; } }; template struct is_custom_type : std::false_type {}; template struct is_custom_type> : std::true_type {}; template struct custom_type_decomposer { 
static_assert(is_custom_type::value, "custom_type_decomposer can only be used with instantiations " "of custom_type"); using T = typename CustomType::first_type; using U = typename CustomType::second_type; HIPCUB_HOST_DEVICE ::hipcub::tuple operator()(CustomType& key) const { return ::hipcub::tuple{key.x, key.y}; } }; template struct generate_limits; template struct generate_limits::value>> { static inline T min() { return std::numeric_limits::min(); } static inline T max() { return std::numeric_limits::max(); } }; template struct generate_limits::value>> { using F = typename T::first_type; using S = typename T::second_type; static inline T min() { return T(generate_limits::min(), generate_limits::min()); } static inline T max() { return T(generate_limits::max(), generate_limits::max()); } }; template struct generate_limits::value>> { static inline T min() { return T(-1000); } static inline T max() { return T(1000); } }; template inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) -> typename std::enable_if::value, std::vector>::type { using first_type = typename T::first_type; using second_type = typename T::second_type; std::vector data(size); auto fdata = get_random_data(size, min.x, max.x, max_random_size); auto sdata = get_random_data(size, min.y, max.y, max_random_size); for(size_t i = 0; i < size; i++) { data[i] = T(fdata[i], sdata[i]); } return data; } template inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) -> typename std::enable_if::value && !std::is_same::value, std::vector>::type { using field_type = decltype(max.x); std::vector data(size); auto field_data = get_random_data(size, min.x, max.x, max_random_size); for(size_t i = 0; i < size; i++) { data[i] = T(field_data[i]); } return data; } template std::vector get_random_segments(const size_t size, const size_t max_segment_length, const int seed_value) { static_assert(std::is_arithmetic::value, "Key type must be arithmetic"); std::default_random_engine prng(seed_value); std::uniform_int_distribution segment_length_distribution(max_segment_length); using key_distribution_type = std::conditional_t::value, std::uniform_int_distribution, std::uniform_real_distribution>; key_distribution_type key_distribution(std::numeric_limits::max()); std::vector keys(size); size_t keys_start_index = 0; while(keys_start_index < size) { const size_t new_segment_length = segment_length_distribution(prng); const size_t new_segment_end = std::min(size, keys_start_index + new_segment_length); const T key = key_distribution(prng); std::fill(std::next(keys.begin(), keys_start_index), std::next(keys.begin(), new_segment_end), key); keys_start_index += new_segment_length; } return keys; } bool is_warp_size_supported(const unsigned required_warp_size) { return HIPCUB_HOST_WARP_THREADS >= required_warp_size; } template __device__ constexpr bool device_test_enabled_for_warp_size_v = HIPCUB_DEVICE_WARP_THREADS >= LogicalWarpSize; template using it_value_t = typename std::iterator_traits::value_type; using engine_type = std::default_random_engine; // generate_random_data_n() generates only part of sequence and replicates it, // because benchmarks usually do not need "true" random sequence. 
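// Hedged usage sketch (added for illustration; the variable names below are
// assumptions, not part of the original header): the *_n overloads write
// through an output iterator instead of returning a fresh vector, so a caller
// that already owns a buffer can avoid the extra allocation, e.g.
//
//   std::vector<float> keys(n);
//   benchmark_utils::engine_type gen(seed);
//   benchmark_utils::generate_random_data_n(keys.begin(), n, 0.0f, 1.0f, gen);
//
// The same partial-generation-plus-replication scheme as above keeps the
// host-side setup cheap.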
template inline auto generate_random_data_n( OutputIter it, size_t size, U min, V max, Generator& gen, size_t max_random_size = 1024 * 1024) -> typename std::enable_if_t>::value, OutputIter> { using T = it_value_t; using dis_type = typename std::conditional<(sizeof(T) == 1), short, T>::type; std::uniform_int_distribution distribution((T)min, (T)max); std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(it, std::min(size - i, max_random_size), it + i); } return it + size; } template inline auto generate_random_data_n(OutputIterator it, size_t size, U min, V max, Generator& gen, size_t max_random_size = 1024 * 1024) -> std::enable_if_t>::value, OutputIterator> { using T = typename std::iterator_traits::value_type; std::uniform_real_distribution distribution((T)min, (T)max); std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(it, std::min(size - i, max_random_size), it + i); } return it + size; } template struct alignas(Alignment) custom_aligned_type { unsigned char data[Size]; }; template::value && std::is_unsigned::value, int> = 0> inline constexpr auto ceiling_div(const T a, const U b) { return a / b + (a % b > 0 ? 1 : 0); } } // namespace benchmark_utils // Need for hipcub::DeviceReduce::Min/Max etc. namespace std { template<> class numeric_limits> { using T = typename benchmark_utils::custom_type; public: static constexpr inline T min() { return std::numeric_limits::min(); } static constexpr inline T max() { return std::numeric_limits::max(); } static constexpr inline T lowest() { return std::numeric_limits::lowest(); } }; template<> class numeric_limits> { using T = typename benchmark_utils::custom_type; public: static constexpr inline T min() { return std::numeric_limits::min(); } static constexpr inline T max() { return std::numeric_limits::max(); } static constexpr inline T lowest() { return std::numeric_limits::lowest(); } }; } // namespace std #endif // HIPCUB_BENCHMARK_UTILS_HPP_ hipCUB-rocm-6.4.3/benchmark/benchmark_warp_exchange.cpp000066400000000000000000000365311502260333500230660ustar00rootroot00000000000000// MIT License // // Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
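// Summary comment added for readability (not in the original source): this
// benchmark times hipcub::WarpExchange data movement between "blocked" and
// "striped" thread arrangements. Each logical warp constructs a WarpExchange
// from its per-warp TempStorage slot and then calls StripedToBlocked,
// BlockedToStriped, or ScatterToStriped on a register-resident
// thread_data[ItemsPerThread] array, as shown in the kernels below.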
#include "common_benchmark_header.hpp" // HIP API #include "hipcub/warp/warp_exchange.hpp" #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template __device__ auto warp_exchange_benchmark(T* d_output) -> std::enable_if_t> { T thread_data[ItemsPerThread]; #pragma unroll for(unsigned i = 0; i < ItemsPerThread; ++i) { thread_data[i] = static_cast(i); } using WarpExchangeT = ::hipcub::WarpExchange; constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_in_block]; const unsigned warp_id = threadIdx.x / LogicalWarpSize; WarpExchangeT warp_exchange(temp_storage[warp_id]); Op{}(warp_exchange, thread_data); #pragma unroll for(unsigned i = 0; i < ItemsPerThread; ++i) { const unsigned global_idx = (BlockSize * blockIdx.x + threadIdx.x) * ItemsPerThread + i; d_output[global_idx] = thread_data[i]; } } template __device__ auto warp_exchange_benchmark(T* /*d_output*/) -> std::enable_if_t> {} template __global__ __launch_bounds__(BlockSize) void warp_exchange_kernel(T* d_output) { warp_exchange_benchmark(d_output); } template __device__ auto warp_exchange_scatter_to_striped_benchmark(T* d_output) -> std::enable_if_t> { const unsigned warp_id = threadIdx.x / LogicalWarpSize; T thread_data[ItemsPerThread]; OffsetT thread_ranks[ItemsPerThread]; #pragma unroll for(unsigned i = 0; i < ItemsPerThread; ++i) { thread_data[i] = static_cast(i); thread_ranks[i] = static_cast(LogicalWarpSize - warp_id * ItemsPerThread - i - 1); } using WarpExchangeT = ::hipcub::WarpExchange; constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_in_block]; WarpExchangeT(temp_storage[warp_id]).ScatterToStriped(thread_data, thread_ranks); #pragma unroll for(unsigned i = 0; i < ItemsPerThread; ++i) { const unsigned striped_global_idx = BlockSize * ItemsPerThread * blockIdx.x + BlockSize * i + threadIdx.x; d_output[striped_global_idx] = thread_data[i]; } } template __device__ auto warp_exchange_scatter_to_striped_benchmark(T* /*d_output*/) -> std::enable_if_t> {} template __global__ __launch_bounds__(BlockSize) void warp_exchange_scatter_to_striped_kernel(T* d_output) { warp_exchange_scatter_to_striped_benchmark( d_output); } template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr unsigned trials = 100; constexpr unsigned items_per_block = BlockSize * ItemsPerThread; const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); T* d_output; HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < trials; ++i) { warp_exchange_kernel <<>>(d_output); } HIP_CHECK(hipPeekAtLastError()) HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * trials * size); HIP_CHECK(hipFree(d_output)); } template void run_benchmark_scatter_to_striped(benchmark::State& state, hipStream_t stream, size_t N) { constexpr unsigned trials = 100; constexpr unsigned items_per_block = BlockSize * ItemsPerThread; const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); T* d_output; HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); for(auto _ : 
state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < trials; ++i) { warp_exchange_scatter_to_striped_kernel <<>>(d_output); } HIP_CHECK(hipPeekAtLastError()) HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * trials * size); HIP_CHECK(hipFree(d_output)); } struct StripedToBlockedOp { template __device__ void operator()(WarpExchangeT& warp_exchange, T (&thread_data)[ItemsPerThread]) const { warp_exchange.StripedToBlocked(thread_data, thread_data); } }; struct BlockedToStripedOp { template __device__ void operator()(WarpExchangeT& warp_exchange, T (&thread_data)[ItemsPerThread]) const { warp_exchange.BlockedToStriped(thread_data, thread_data); } }; #define CREATE_BENCHMARK_STRIPED_TO_BLOCKED(T, BS, IT, WS, ALG) \ benchmark::RegisterBenchmark(std::string("warp_exchange_striped_to_blocked.") \ .c_str(), \ &run_benchmark, \ stream, \ size) #define CREATE_BENCHMARK_BLOCKED_TO_STRIPED(T, BS, IT, WS, ALG) \ benchmark::RegisterBenchmark(std::string("warp_exchange_blocked_to_striped.") \ .c_str(), \ &run_benchmark, \ stream, \ size) #define CREATE_BENCHMARK_SCATTER_TO_STRIPED(T, OFFSET_T, BS, IT, WS) \ benchmark::RegisterBenchmark(std::string("warp_exchange_scatter_to_striped.") \ .c_str(), \ &run_benchmark_scatter_to_striped, \ stream, \ size) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_warp_exchange" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks{ CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, ::hipcub::WARP_EXCHANGE_SMEM), CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, ::hipcub::WARP_EXCHANGE_SMEM), CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, ::hipcub::WARP_EXCHANGE_SMEM), CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, ::hipcub::WARP_EXCHANGE_SMEM), CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, ::hipcub::WARP_EXCHANGE_SMEM), CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, ::hipcub::WARP_EXCHANGE_SMEM), CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, ::hipcub::WARP_EXCHANGE_SMEM), CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, ::hipcub::WARP_EXCHANGE_SMEM), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 16), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 32), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 256, 4, 32), CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, ::hipcub::WARP_EXCHANGE_SHUFFLE), CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, ::hipcub::WARP_EXCHANGE_SHUFFLE), // CUB requires WS == IPT for WARP_EXCHANGE_SHUFFLE #ifdef HIPCUB_ROCPRIM_API CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, ::hipcub::WARP_EXCHANGE_SHUFFLE), CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, ::hipcub::WARP_EXCHANGE_SHUFFLE), 
CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, ::hipcub::WARP_EXCHANGE_SHUFFLE), CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, ::hipcub::WARP_EXCHANGE_SHUFFLE), CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, ::hipcub::WARP_EXCHANGE_SHUFFLE), CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, ::hipcub::WARP_EXCHANGE_SHUFFLE), #endif }; #ifdef HIPCUB_ROCPRIM_API if(::benchmark_utils::is_warp_size_supported(64)) { std::vector additional_benchmarks{ CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, ::hipcub::WARP_EXCHANGE_SMEM), CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, ::hipcub::WARP_EXCHANGE_SHUFFLE), CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, ::hipcub::WARP_EXCHANGE_SMEM), CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, ::hipcub::WARP_EXCHANGE_SHUFFLE), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 64), CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, ::hipcub::WARP_EXCHANGE_SMEM), CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, ::hipcub::WARP_EXCHANGE_SHUFFLE), CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, ::hipcub::WARP_EXCHANGE_SMEM), CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, ::hipcub::WARP_EXCHANGE_SHUFFLE), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 256, 4, 64)}; benchmarks.insert(benchmarks.end(), additional_benchmarks.begin(), additional_benchmarks.end()); } #endif // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_warp_load.cpp000066400000000000000000000326651502260333500222270ustar00rootroot00000000000000// MIT License // // Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
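// Summary comment added for readability (not in the original source): this
// benchmark measures hipcub::WarpLoad with the DIRECT, STRIPED, VECTORIZE, and
// TRANSPOSE algorithms. Each logical warp builds a WarpLoad object from its
// per-warp TempStorage slot and loads a tile of LogicalWarpSize * ItemsPerThread
// elements into registers:
//
//   WarpLoadT(temp_storage[warp_id]).Load(d_input + global_warp_id * tile_size, thread_data);
//
// The loaded values are then written back to global memory in a striped layout.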
#include "common_benchmark_header.hpp" // HIP API #include "hipcub/warp/warp_load.hpp" #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template __device__ auto warp_load_benchmark(T* d_input, T* d_output) -> std::enable_if_t> { using WarpLoadT = ::hipcub::WarpLoad; constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; constexpr int tile_size = ItemsPerThread * LogicalWarpSize; const unsigned warp_id = threadIdx.x / LogicalWarpSize; const unsigned global_warp_id = blockIdx.x * warps_in_block + warp_id; __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; T thread_data[ItemsPerThread]; WarpLoadT(temp_storage[warp_id]).Load(d_input + global_warp_id * tile_size, thread_data); #pragma unroll for(unsigned i = 0; i < ItemsPerThread; ++i) { const unsigned striped_global_idx = BlockSize * ItemsPerThread * blockIdx.x + BlockSize * i + threadIdx.x; d_output[striped_global_idx] = thread_data[i]; } } template __device__ auto warp_load_benchmark(T* /*d_input*/, T* /*d_output*/) -> std::enable_if_t> {} template __global__ __launch_bounds__(BlockSize) void warp_load_kernel(T* d_input, T* d_output) { warp_load_benchmark(d_input, d_output); } template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr unsigned items_per_block = BlockSize * ItemsPerThread; const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); T* d_input; T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < Trials; i++) { warp_load_kernel <<>>(d_input, d_output); } HIP_CHECK(hipPeekAtLastError()) HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ benchmark::RegisterBenchmark(std::string("warp_load.") \ .c_str(), \ &run_benchmark, \ stream, \ size) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "benchmark_warp_load" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks{ CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_STRIPED), 
CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_VECTORIZE), // WARP_LOAD_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_VECTORIZE) // WARP_LOAD_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_TRANSPOSE) }; if(::benchmark_utils::is_warp_size_supported(64)) { std::vector additional_benchmarks{ CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_LOAD_DIRECT), 
CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_VECTORIZE), // WARP_LOAD_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_VECTORIZE), // WARP_LOAD_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_VECTORIZE) // WARP_LOAD_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_TRANSPOSE) }; benchmarks.insert(benchmarks.end(), additional_benchmarks.begin(), additional_benchmarks.end()); } // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_warp_merge_sort.cpp000066400000000000000000000556241502260333500234560ustar00rootroot00000000000000// MIT License // // Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
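// Summary comment added for readability (not in the original source): this
// benchmark exercises hipcub::WarpMergeSort for keys-only and key-value sorts,
// in both full-tile and segmented variants. Each logical warp constructs the
// sorter from its per-warp TempStorage slot and sorts a register-resident tile:
//
//   warp_merge_sort wsort{storage[warp_id]};
//   wsort.Sort(keys, compare_op);
//
// The segmented variants additionally pass the segment size and an
// out-of-bounds default key to Sort().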
#include "common_benchmark_header.hpp" #include "../test/hipcub/test_utils_sort_comparator.hpp" // HIP API #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_store.hpp" #include "hipcub/util_ptx.hpp" #include "hipcub/warp/warp_merge_sort.hpp" #include #ifndef DEFAULT_N constexpr size_t DEFAULT_N = 1024 * 1024 * 128; #endif enum class benchmark_kinds { sort_keys, sort_pairs, }; template __device__ auto sort_keys_benchmark(const T* input, T* output, Compare compare_op) -> std::enable_if_t> { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int flat_tid = threadIdx.x; const unsigned int block_offset = blockIdx.x * items_per_block; T keys[ItemsPerThread]; hipcub::LoadDirectBlocked(flat_tid, input + block_offset, keys); constexpr unsigned int warps_per_block = BlockSize / LogicalWarpSize; const unsigned int warp_id = threadIdx.x / LogicalWarpSize; using warp_merge_sort = hipcub::WarpMergeSort; __shared__ typename warp_merge_sort::TempStorage storage[warps_per_block]; warp_merge_sort wsort{storage[warp_id]}; wsort.Sort(keys, compare_op); hipcub::StoreDirectBlocked(flat_tid, output + block_offset, keys); } template __device__ auto sort_keys_benchmark(const T* /*input*/, T* /*output*/, Compare /*compare_op*/) -> std::enable_if_t> {} template __global__ __launch_bounds__(BlockSize) void sort_keys(const T* input, T* output, Compare compare_op) { sort_keys_benchmark(input, output, compare_op); } template __device__ auto sort_pairs_benchmark(const T* input, T* output, Compare compare_op) -> std::enable_if_t> { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int flat_tid = threadIdx.x; const unsigned int block_offset = blockIdx.x * items_per_block; T keys[ItemsPerThread]; T values[ItemsPerThread]; hipcub::LoadDirectBlocked(flat_tid, input + block_offset, keys); for(unsigned int i = 0; i < ItemsPerThread; ++i) { values[i] = keys[i] + T(1); } constexpr unsigned int warps_per_block = BlockSize / LogicalWarpSize; const unsigned int warp_id = threadIdx.x / LogicalWarpSize; using warp_merge_sort = hipcub::WarpMergeSort; __shared__ typename warp_merge_sort::TempStorage storage[warps_per_block]; warp_merge_sort wsort{storage[warp_id]}; wsort.Sort(keys, values, compare_op); for(unsigned int i = 0; i < ItemsPerThread; ++i) { keys[i] += values[i]; } hipcub::StoreDirectBlocked(flat_tid, output + block_offset, keys); } template __device__ auto sort_pairs_benchmark(const T* /*input*/, T* /*output*/, Compare /*compare_op*/) -> std::enable_if_t> {} template __global__ __launch_bounds__(BlockSize) void sort_pairs(const T* input, T* output, Compare compare_op) { sort_pairs_benchmark(input, output, compare_op); } template struct max_value { static constexpr T value = std::numeric_limits::max(); }; template __device__ auto sort_keys_segmented_benchmark(const T* input, T* output, const unsigned int* segment_sizes, Compare compare) -> std::enable_if_t> { constexpr unsigned int max_segment_size = LogicalWarpSize * ItemsPerThread; constexpr unsigned int segments_per_block = BlockSize / LogicalWarpSize; using warp_merge_sort = hipcub::WarpMergeSort; __shared__ typename warp_merge_sort::TempStorage storage[segments_per_block]; const unsigned int warp_id = threadIdx.x / LogicalWarpSize; warp_merge_sort wsort{storage[warp_id]}; const unsigned int segment_id = blockIdx.x * segments_per_block + warp_id; const unsigned int segment_size = segment_sizes[segment_id]; const unsigned int warp_offset = segment_id * max_segment_size; T 
keys[ItemsPerThread]; const unsigned int flat_tid = wsort.get_linear_tid(); hipcub::LoadDirectBlocked(flat_tid, input + warp_offset, keys, segment_size); const T oob_default = max_value::value; wsort.Sort(keys, compare, segment_size, oob_default); hipcub::StoreDirectBlocked(flat_tid, output + warp_offset, keys, segment_size); } template __device__ auto sort_keys_segmented_benchmark(const T* /*input*/, T* /*output*/, const unsigned int* /*segment_sizes*/, Compare /*compare*/) -> std::enable_if_t> {} template __global__ __launch_bounds__(BlockSize) void sort_keys_segmented(const T* input, T* output, const unsigned int* segment_sizes, Compare compare) { sort_keys_segmented_benchmark(input, output, segment_sizes, compare); } template __device__ auto sort_pairs_segmented_benchmark(const T* input, T* output, const unsigned int* segment_sizes, Compare compare) -> std::enable_if_t> { constexpr unsigned int max_segment_size = LogicalWarpSize * ItemsPerThread; constexpr unsigned int segments_per_block = BlockSize / LogicalWarpSize; using warp_merge_sort = hipcub::WarpMergeSort; __shared__ typename warp_merge_sort::TempStorage storage[segments_per_block]; const unsigned int warp_id = threadIdx.x / LogicalWarpSize; warp_merge_sort wsort{storage[warp_id]}; const unsigned int segment_id = blockIdx.x * segments_per_block + warp_id; const unsigned int segment_size = segment_sizes[segment_id]; const unsigned int warp_offset = segment_id * max_segment_size; T keys[ItemsPerThread]; T values[ItemsPerThread]; const unsigned int flat_tid = wsort.get_linear_tid(); hipcub::LoadDirectBlocked(flat_tid, input + warp_offset, keys, segment_size); for(unsigned int i = 0; i < ItemsPerThread; ++i) { if(flat_tid * ItemsPerThread + i < segment_size) { values[i] = keys[i] + T(1); } } const T oob_default = max_value::value; wsort.Sort(keys, values, compare, segment_size, oob_default); for(unsigned int i = 0; i < ItemsPerThread; ++i) { if(flat_tid * ItemsPerThread + i < segment_size) { keys[i] += values[i]; } } hipcub::StoreDirectBlocked(flat_tid, output + warp_offset, keys, segment_size); } template __device__ auto sort_pairs_segmented_benchmark(const T* /*input*/, T* /*output*/, const unsigned int* /*segment_sizes*/, Compare /*compare*/) -> std::enable_if_t> {} template __global__ __launch_bounds__(BlockSize) void sort_pairs_segmented(const T* input, T* output, const unsigned int* segment_sizes, Compare compare) { sort_pairs_segmented_benchmark(input, output, segment_sizes, compare); } template void run_benchmark(benchmark::State& state, const benchmark_kinds benchmark_kind, const hipStream_t stream, const size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); const std::vector input = benchmark_utils::get_random_data(size, benchmark_utils::generate_limits::min(), benchmark_utils::generate_limits::max()); T* d_input = nullptr; T* d_output = nullptr; HIP_CHECK(hipMalloc(&d_input, size * sizeof(input[0]))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(input[0]))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); if(benchmark_kind == benchmark_kinds::sort_keys) { for(unsigned int i = 0; i < Trials; ++i) { sort_keys <<>>(d_input, d_output, CompareOp{}); } } else if(benchmark_kind == benchmark_kinds::sort_pairs) { for(unsigned int i = 0; i < Trials; ++i) { sort_pairs <<>>(d_input, d_output, CompareOp{}); } } 
HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } template void run_segmented_benchmark(benchmark::State& state, const benchmark_kinds benchmark_kind, const hipStream_t stream, const size_t N) { constexpr auto max_segment_size = LogicalWarpSize * ItemsPerThread; constexpr auto segments_per_block = BlockSize / LogicalWarpSize; constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto num_blocks = (N + items_per_block - 1) / items_per_block; const auto num_segments = num_blocks * segments_per_block; const auto size = num_blocks * items_per_block; const std::vector input = benchmark_utils::get_random_data(size, benchmark_utils::generate_limits::min(), benchmark_utils::generate_limits::max()); const auto segment_sizes = benchmark_utils::get_random_data(num_segments, 0, max_segment_size); T* d_input = nullptr; T* d_output = nullptr; unsigned int* d_segment_sizes = nullptr; HIP_CHECK(hipMalloc(&d_input, size * sizeof(input[0]))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(input[0]))); HIP_CHECK(hipMalloc(&d_segment_sizes, num_segments * sizeof(segment_sizes[0]))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(d_segment_sizes, segment_sizes.data(), num_segments * sizeof(segment_sizes[0]), hipMemcpyHostToDevice)); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); if(benchmark_kind == benchmark_kinds::sort_keys) { for(unsigned int i = 0; i < Trials; ++i) { sort_keys_segmented <<>>(d_input, d_output, d_segment_sizes, CompareOp{}); } } else if(benchmark_kind == benchmark_kinds::sort_pairs) { for(unsigned int i = 0; i < Trials; ++i) { sort_pairs_segmented <<>>(d_input, d_output, d_segment_sizes, CompareOp{}); } } HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_segment_sizes)); } #define CREATE_BENCHMARK(T, BS, WS, IPT) \ if(WS <= device_warp_size) \ { \ benchmarks.push_back(benchmark::RegisterBenchmark( \ std::string("warp_merge_sort.sub_algorithm_name:" \ + name) \ .c_str(), \ segmented ? 
&run_benchmark : &run_segmented_benchmark, \ benchmark_kind, \ stream, \ size)); \ } #define BENCHMARK_TYPE_WS(type, block, warp) \ CREATE_BENCHMARK(type, block, warp, 1); \ CREATE_BENCHMARK(type, block, warp, 4); \ CREATE_BENCHMARK(type, block, warp, 8) #define BENCHMARK_TYPE(type, block) \ BENCHMARK_TYPE_WS(type, block, 4); \ BENCHMARK_TYPE_WS(type, block, 16); \ BENCHMARK_TYPE_WS(type, block, 32); \ BENCHMARK_TYPE_WS(type, block, 64) void add_benchmarks(const benchmark_kinds benchmark_kind, const std::string& name, std::vector& benchmarks, const hipStream_t stream, const size_t size, const bool segmented, const unsigned int device_warp_size) { BENCHMARK_TYPE(int, 256); BENCHMARK_TYPE(int8_t, 256); BENCHMARK_TYPE(uint8_t, 256); BENCHMARK_TYPE(long long, 256); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_warp_merge_sort" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; const auto device_warp_size = [] { const int result = HIPCUB_HOST_WARP_THREADS; if(result > 0) { std::cout << "[HIP] Device warp size: " << result << std::endl; } else { std::cerr << "Failed to get device warp size! Aborting.\n"; std::exit(1); } return static_cast(result); }(); // Add benchmarks std::vector benchmarks; add_benchmarks(benchmark_kinds::sort_keys, "sort(keys)", benchmarks, stream, size, false, device_warp_size); add_benchmarks(benchmark_kinds::sort_pairs, "sort(keys, values)", benchmarks, stream, size, false, device_warp_size); add_benchmarks(benchmark_kinds::sort_keys, "segmented_sort(keys)", benchmarks, stream, size, true, device_warp_size); add_benchmarks(benchmark_kinds::sort_pairs, "segmented_sort(keys, values)", benchmarks, stream, size, true, device_warp_size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_warp_reduce.cpp000066400000000000000000000244031502260333500225460ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/warp/warp_reduce.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template __global__ __launch_bounds__(64) auto warp_reduce_kernel(const T* d_input, T* d_output) -> std::enable_if_t> { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; auto value = d_input[i]; using wreduce_t = hipcub::WarpReduce; __shared__ typename wreduce_t::TempStorage storage; auto reduce_op = hipcub::Sum(); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { value = wreduce_t(storage).Reduce(value, reduce_op); } d_output[i] = value; } template __global__ __launch_bounds__(64) auto warp_reduce_kernel(const T* /*d_input*/, T* /*d_output*/) -> std::enable_if_t> {} template __global__ __launch_bounds__(64) auto segmented_warp_reduce_kernel(const T* d_input, Flag* d_flags, T* d_output) -> std::enable_if_t> { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; auto value = d_input[i]; auto flag = d_flags[i]; using wreduce_t = hipcub::WarpReduce; __shared__ typename wreduce_t::TempStorage storage; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { value = wreduce_t(storage).HeadSegmentedSum(value, flag); } d_output[i] = value; } template __global__ __launch_bounds__(64) auto segmented_warp_reduce_kernel(const T* /*d_input*/, Flag* /*d_flags*/, T* /*d_output*/) -> std::enable_if_t> {} template inline auto execute_warp_reduce_kernel( T* input, T* output, Flag* /* flags */, size_t size, hipStream_t stream) -> typename std::enable_if::type { hipLaunchKernelGGL(HIP_KERNEL_NAME(warp_reduce_kernel), dim3(size / BlockSize), dim3(BlockSize), 0, stream, input, output); HIP_CHECK(hipPeekAtLastError()); } template inline auto execute_warp_reduce_kernel(T* input, T* output, Flag* flags, size_t size, hipStream_t stream) -> typename std::enable_if::type { hipLaunchKernelGGL(HIP_KERNEL_NAME(segmented_warp_reduce_kernel), dim3(size / BlockSize), dim3(BlockSize), 0, stream, input, flags, output); HIP_CHECK(hipPeekAtLastError()); } template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { using flag_type = unsigned char; const auto size = BlockSize * ((N + BlockSize - 1) / BlockSize); std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); std::vector flags = benchmark_utils::get_random_data(size, 0, 1); T* d_input; flag_type* d_flags; T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_flags, size * sizeof(flag_type))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(d_flags, flags.data(), size * sizeof(flag_type), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); execute_warp_reduce_kernel(d_input, d_output, d_flags, size, stream); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); 
state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_flags)); } #define CREATE_BENCHMARK(T, WS, BS) \ benchmark::RegisterBenchmark(std::string("warp_reduce.sub_algorithm_name:" \ + name) \ .c_str(), \ &run_benchmark, \ stream, \ size) // If warp size limit is 16 #define BENCHMARK_TYPE_WS16(type) CREATE_BENCHMARK(type, 15, 32), CREATE_BENCHMARK(type, 16, 32) // If warp size limit is 32 #define BENCHMARK_TYPE_WS32(type) \ BENCHMARK_TYPE_WS16(type), CREATE_BENCHMARK(type, 31, 32), CREATE_BENCHMARK(type, 32, 32), \ CREATE_BENCHMARK(type, 32, 64) // If warp size limit is 64 #define BENCHMARK_TYPE_WS64(type) \ BENCHMARK_TYPE_WS32(type), CREATE_BENCHMARK(type, 37, 64), CREATE_BENCHMARK(type, 61, 64), \ CREATE_BENCHMARK(type, 64, 64) template void add_benchmarks(const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { #if HIPCUB_WARP_THREADS_MACRO == 16 BENCHMARK_TYPE_WS16(int), BENCHMARK_TYPE_WS16(float), BENCHMARK_TYPE_WS16(double), BENCHMARK_TYPE_WS16(int8_t), BENCHMARK_TYPE_WS16(uint8_t) #elif HIPCUB_WARP_THREADS_MACRO == 32 BENCHMARK_TYPE_WS32(int), BENCHMARK_TYPE_WS32(float), BENCHMARK_TYPE_WS32(double), BENCHMARK_TYPE_WS32(int8_t), BENCHMARK_TYPE_WS32(uint8_t) #else BENCHMARK_TYPE_WS64(int), BENCHMARK_TYPE_WS64(float), BENCHMARK_TYPE_WS64(double), BENCHMARK_TYPE_WS64(int8_t), BENCHMARK_TYPE_WS64(uint8_t) #endif }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_warp_reduce" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks("reduce", benchmarks, stream, size); add_benchmarks("segmented_reduce", benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_warp_scan.cpp000066400000000000000000000247671502260333500222400ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/warp/warp_scan.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif enum class scan_type { inclusive_scan, exclusive_scan, broadcast }; template __global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output, const T init) { Runner::template run(input, output, init); } struct inclusive_scan { template __device__ static auto run(const T* input, T* output, const T init) -> std::enable_if_t> { (void)init; const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; auto value = input[i]; using wscan_t = hipcub::WarpScan; __shared__ typename wscan_t::TempStorage storage; auto scan_op = hipcub::Sum(); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { wscan_t(storage).InclusiveScan(value, value, scan_op); } output[i] = value; } template __device__ static auto run(const T* /*input*/, T* /*output*/, const T /*init*/) -> std::enable_if_t> {} }; struct exclusive_scan { template __device__ static auto run(const T* input, T* output, const T init) -> std::enable_if_t> { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; auto value = input[i]; using wscan_t = hipcub::WarpScan; __shared__ typename wscan_t::TempStorage storage; auto scan_op = hipcub::Sum(); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { wscan_t(storage).ExclusiveScan(value, value, init, scan_op); } output[i] = value; } template __device__ static auto run(const T* /*input*/, T* /*output*/, const T /*init*/) -> std::enable_if_t> {} }; struct broadcast { template __device__ static auto run(const T* input, T* output, const T init) -> std::enable_if_t> { (void)init; const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; auto value = input[i]; using wscan_t = hipcub::WarpScan; __shared__ typename wscan_t::TempStorage storage; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { value = wscan_t(storage).Broadcast(value, 0); } output[i] = value; } template __device__ static auto run(const T* /*input*/, T* /*output*/, const T /*init*/) -> std::enable_if_t> {} }; template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { // Make sure size is a multiple of BlockSize size = BlockSize * ((size + BlockSize - 1) / BlockSize); // Allocate and fill memory std::vector input(size, 1.0f); T* d_input; T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(size / BlockSize), dim3(BlockSize), 0, stream, d_input, d_output, input[0]); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); 
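        // Report the manually measured kernel time to Google Benchmark; main()
        // switches every registered benchmark to UseManualTime(), so this value
        // (not the wall-clock time of the state loop) drives the reported statistics.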
state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); state.SetItemsProcessed(state.iterations() * size * Trials); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK_IMPL(T, BS, WS, OP) \ benchmark::RegisterBenchmark(std::string("warp_scan.sub_algorithm_name:" \ + method_name) \ .c_str(), \ &run_benchmark, \ stream, \ size) #define CREATE_BENCHMARK(T, BS, WS) CREATE_BENCHMARK_IMPL(T, BS, WS, Benchmark) // clang-format off // If warp size limit is 16 #define BENCHMARK_TYPE_WS16(type) \ CREATE_BENCHMARK(type, 60, 15), \ CREATE_BENCHMARK(type, 256, 16) // If warp size limit is 32 #define BENCHMARK_TYPE_WS32(type) \ BENCHMARK_TYPE_WS16(type), \ CREATE_BENCHMARK(type, 62, 31), \ CREATE_BENCHMARK(type, 256, 32) // If warp size limit is 64 #define BENCHMARK_TYPE_WS64(type) \ BENCHMARK_TYPE_WS32(type), \ CREATE_BENCHMARK(type, 63, 63), \ CREATE_BENCHMARK(type, 64, 64), \ CREATE_BENCHMARK(type, 128, 64), \ CREATE_BENCHMARK(type, 256, 64) // clang-format on template void add_benchmarks(std::vector& benchmarks, const std::string& method_name, hipStream_t stream, size_t size) { using custom_double2 = benchmark_utils::custom_type; using custom_int_double = benchmark_utils::custom_type; std::vector new_benchmarks = { #if HIPCUB_WARP_THREADS_MACRO == 16 BENCHMARK_TYPE_WS16(int), BENCHMARK_TYPE_WS16(float), BENCHMARK_TYPE_WS16(double), BENCHMARK_TYPE_WS16(int8_t), BENCHMARK_TYPE_WS16(custom_double2), BENCHMARK_TYPE_WS16(custom_int_double) #elif HIPCUB_WARP_THREADS_MACRO == 32 BENCHMARK_TYPE_WS32(int), BENCHMARK_TYPE_WS32(float), BENCHMARK_TYPE_WS32(double), BENCHMARK_TYPE_WS32(int8_t), BENCHMARK_TYPE_WS32(custom_double2), BENCHMARK_TYPE_WS32(custom_int_double) #else BENCHMARK_TYPE_WS64(int), BENCHMARK_TYPE_WS64(float), BENCHMARK_TYPE_WS64(double), BENCHMARK_TYPE_WS64(int8_t), BENCHMARK_TYPE_WS64(custom_double2), BENCHMARK_TYPE_WS64(custom_int_double) #endif }; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_warp_scan" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks(benchmarks, "inclusive_scan", stream, size); add_benchmarks(benchmarks, "exclusive_scan", stream, size); add_benchmarks(benchmarks, "broadcast", stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/benchmark_warp_store.cpp000066400000000000000000000322171502260333500224350ustar00rootroot00000000000000// MIT License // // Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/warp/warp_store.hpp" #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template __device__ auto warp_store_benchmark(T* d_output) -> std::enable_if_t> { T thread_data[ItemsPerThread]; #pragma unroll for(unsigned i = 0; i < ItemsPerThread; ++i) { thread_data[i] = static_cast(i); } using WarpStoreT = ::hipcub::WarpStore; constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; constexpr int tile_size = ItemsPerThread * LogicalWarpSize; __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block]; const unsigned warp_id = threadIdx.x / LogicalWarpSize; const unsigned global_warp_id = blockIdx.x * warps_in_block + warp_id; WarpStoreT(temp_storage[warp_id]).Store(d_output + global_warp_id * tile_size, thread_data); } template __device__ auto warp_store_benchmark(T* /*d_output*/) -> std::enable_if_t> {} template __global__ __launch_bounds__(BlockSize) void warp_store_kernel(T* d_output) { warp_store_benchmark(d_output); } template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr unsigned items_per_block = BlockSize * ItemsPerThread; const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); T* d_output; HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < Trials; ++i) { warp_store_kernel <<>>(d_output); } HIP_CHECK(hipPeekAtLastError()) HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ benchmark::RegisterBenchmark(std::string("warp_store.") \ .c_str(), \ &run_benchmark, \ stream, \ size) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_warp_store" << std::endl; // HIP 
hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks{ CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_VECTORIZE), // WARP_STORE_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_VECTORIZE) // WARP_STORE_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_TRANSPOSE) }; if(::benchmark_utils::is_warp_size_supported(64)) { std::vector additional_benchmarks{ CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_STRIPED), 
CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_VECTORIZE), // WARP_STORE_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 16, 64, // ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_VECTORIZE), // WARP_STORE_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 32, 64, // ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_VECTORIZE) // WARP_STORE_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_TRANSPOSE) }; benchmarks.insert(benchmarks.end(), additional_benchmarks.begin(), additional_benchmarks.end()); } // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-6.4.3/benchmark/cmdparser.hpp000066400000000000000000000417201502260333500202220ustar00rootroot00000000000000// The MIT License (MIT) // // Copyright (c) 2015 - 2016 Florian Rappl // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be 
included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. /* This file is part of the C++ CmdParser utility. Copyright (c) 2015 - 2016 Florian Rappl */ #pragma once #include #include #include #include #include #include namespace cli { struct CallbackArgs { const std::vector& arguments; std::ostream& output; std::ostream& error; }; class Parser { private: class CmdBase { public: explicit CmdBase(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant, bool variadic) : name(name), command(name.size() > 0 ? "-" + name : ""), alternative(alternative.size() > 0 ? "--" + alternative : ""), description(description), required(required), handled(false), arguments({}), dominant(dominant), variadic(variadic) { } virtual ~CmdBase() { } std::string name; std::string command; std::string alternative; std::string description; bool required; bool handled; std::vector arguments; bool const dominant; bool const variadic; virtual std::string print_value() const = 0; virtual bool parse(std::ostream& output, std::ostream& error) = 0; bool is(const std::string& given) const { return given == command || given == alternative; } }; template struct ArgumentCountChecker { static constexpr bool Variadic = false; }; template struct ArgumentCountChecker> { static constexpr bool Variadic = true; }; template class CmdFunction final : public CmdBase { public: explicit CmdFunction(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) : CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker::Variadic) { } virtual bool parse(std::ostream& output, std::ostream& error) override { try { CallbackArgs args { arguments, output, error }; value = callback(args); return true; } catch (...) { return false; } } virtual std::string print_value() const override { return ""; } std::function callback; T value; }; template class CmdArgument final : public CmdBase { public: explicit CmdArgument(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) : CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker::Variadic), value(T()) { } virtual bool parse(std::ostream&, std::ostream&) override { try { value = Parser::parse(arguments, value); return true; } catch (...) 
{ return false; } } virtual std::string print_value() const override { return stringify(value); } T value; }; static int parse(const std::vector& elements, const int&) { if (elements.size() != 1) throw std::bad_cast(); return std::stoi(elements[0]); } static bool parse(const std::vector& elements, const bool& defval) { if (elements.size() != 0) throw std::runtime_error("A boolean command line parameter cannot have any arguments."); return !defval; } static double parse(const std::vector& elements, const double&) { if (elements.size() != 1) throw std::bad_cast(); return std::stod(elements[0]); } static float parse(const std::vector& elements, const float&) { if (elements.size() != 1) throw std::bad_cast(); return std::stof(elements[0]); } static long double parse(const std::vector& elements, const long double&) { if (elements.size() != 1) throw std::bad_cast(); return std::stold(elements[0]); } static unsigned int parse(const std::vector& elements, const unsigned int&) { if (elements.size() != 1) throw std::bad_cast(); return static_cast(std::stoul(elements[0])); } static unsigned long parse(const std::vector& elements, const unsigned long&) { if (elements.size() != 1) throw std::bad_cast(); return std::stoul(elements[0]); } static unsigned long long parse(const std::vector& elements, const unsigned long long&) { if (elements.size() != 1) throw std::bad_cast(); return std::stoull(elements[0]); } static long parse(const std::vector& elements, const long&) { if (elements.size() != 1) throw std::bad_cast(); return std::stol(elements[0]); } static std::string parse(const std::vector& elements, const std::string&) { if (elements.size() != 1) throw std::bad_cast(); return elements[0]; } template static std::vector parse(const std::vector& elements, const std::vector&) { const T defval = T(); std::vector values { }; std::vector buffer(1); for (const auto& element : elements) { buffer[0] = element; values.push_back(parse(buffer, defval)); } return values; } template static std::string stringify(const T& value) { return std::to_string(value); } template static std::string stringify(const std::vector& values) { std::stringstream ss { }; ss << "[ "; for (const auto& value : values) { ss << stringify(value) << " "; } ss << "]"; return ss.str(); } static std::string stringify(const std::string& str) { return str; } public: explicit Parser(int argc, const char** argv) : _appname(argv[0]) { for (int i = 1; i < argc; ++i) { _arguments.push_back(argv[i]); } enable_help(); } explicit Parser(int argc, char** argv) : _appname(argv[0]) { for (int i = 1; i < argc; ++i) { _arguments.push_back(argv[i]); } enable_help(); } ~Parser() { for (int i = 0, n = _commands.size(); i < n; ++i) { delete _commands[i]; } } bool has_help() const { for (const auto command : _commands) { if (command->name == "h" && command->alternative == "--help") { return true; } } return false; } void enable_help() { set_callback("h", "help", std::function([this](CallbackArgs& args){ args.output << this->usage(); /*exit(0);*/ return false; }), "", true); } void disable_help() { for (auto command = _commands.begin(); command != _commands.end(); ++command) { if ((*command)->name == "h" && (*command)->alternative == "--help") { _commands.erase(command); break; } } } template void set_default(bool is_required, const std::string& description = "") { auto command = new CmdArgument { "", "", description, is_required, false }; _commands.push_back(command); } template void set_required(const std::string& name, const std::string& alternative, const 
std::string& description = "", bool dominant = false) { auto command = new CmdArgument { name, alternative, description, true, dominant }; _commands.push_back(command); } template void set_optional(const std::string& name, const std::string& alternative, T defaultValue, const std::string& description = "", bool dominant = false) { auto command = new CmdArgument { name, alternative, description, false, dominant }; command->value = defaultValue; _commands.push_back(command); } template void set_callback(const std::string& name, const std::string& alternative, std::function callback, const std::string& description = "", bool dominant = false) { auto command = new CmdFunction { name, alternative, description, false, dominant }; command->callback = callback; _commands.push_back(command); } inline void run_and_exit_if_error() { if (run() == false) { exit(1); } } inline bool run() { return run(std::cout, std::cerr); } inline bool run(std::ostream& output) { return run(output, std::cerr); } bool run(std::ostream& output, std::ostream& error) { if (_arguments.size() > 0) { auto current = find_default(); for (int i = 0, n = _arguments.size(); i < n; ++i) { auto isarg = _arguments[i].size() > 0 && _arguments[i][0] == '-'; auto associated = isarg ? find(_arguments[i]) : nullptr; if (associated != nullptr) { current = associated; associated->handled = true; } else if (current == nullptr) { current = find(_arguments[i]); // Code was commented out so cmdparser can ignore unknown options // error << no_default(); // return false; } else { current->arguments.push_back(_arguments[i]); current->handled = true; if (!current->variadic) { // If the current command is not variadic, then no more arguments // should be added to it. In this case, switch back to the default // command. current = find_default(); } } } } // First, parse dominant arguments since they succeed even if required // arguments are missing. for (auto command : _commands) { if (command->handled && command->dominant && !command->parse(output, error)) { error << howto_use(command); return false; } } // Next, check for any missing arguments. for (auto command : _commands) { if (command->required && !command->handled) { error << howto_required(command); return false; } } // Finally, parse all remaining arguments. 
for (auto command : _commands) { if (command->handled && !command->dominant && !command->parse(output, error)) { error << howto_use(command); return false; } } return true; } template T get(const std::string& name) const { for (const auto& command : _commands) { if (command->name == name) { auto cmd = dynamic_cast*>(command); if (cmd == nullptr) { throw std::runtime_error("Invalid usage of the parameter " + name + " detected."); } return cmd->value; } } throw std::runtime_error("The parameter " + name + " could not be found."); } template T get_if(const std::string& name, std::function callback) const { auto value = get(name); return callback(value); } int requirements() const { int count = 0; for (const auto& command : _commands) { if (command->required) { ++count; } } return count; } int commands() const { return static_cast(_commands.size()); } inline const std::string& app_name() const { return _appname; } protected: CmdBase* find(const std::string& name) { for (auto command : _commands) { if (command->is(name)) { return command; } } return nullptr; } CmdBase* find_default() { for (auto command : _commands) { if (command->name == "") { return command; } } return nullptr; } std::string usage() const { std::stringstream ss { }; ss << "Available parameters:\n\n"; for (const auto& command : _commands) { ss << " " << command->command << "\t" << command->alternative; if (command->required == true) { ss << "\t(required)"; } ss << "\n " << command->description; if (command->required == false) { ss << "\n " << "This parameter is optional. The default value is '" + command->print_value() << "'."; } ss << "\n\n"; } return ss.str(); } void print_help(std::stringstream& ss) const { if (has_help()) { ss << "For more help use --help or -h.\n"; } } std::string howto_required(CmdBase* command) const { std::stringstream ss { }; ss << "The parameter " << command->name << " is required.\n"; ss << command->description << '\n'; print_help(ss); return ss.str(); } std::string howto_use(CmdBase* command) const { std::stringstream ss { }; ss << "The parameter " << command->name << " has invalid arguments.\n"; ss << command->description << '\n'; print_help(ss); return ss.str(); } std::string no_default() const { std::stringstream ss { }; ss << "No default parameter has been specified.\n"; ss << "The given argument must be used with a parameter.\n"; print_help(ss); return ss.str(); } private: const std::string _appname; std::vector _arguments; std::vector _commands; }; } hipCUB-rocm-6.4.3/benchmark/common_benchmark_header.hpp000066400000000000000000000040641502260333500230540ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include #include #include #include #include #include #include #include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" // HIP API #include // benchmark_utils.hpp should only be included by this header. // The following definition is used as guard in benchmark_utils.hpp // Including benchmark_utils.hpp by itself will cause a compile error. #define BENCHMARK_UTILS_INCLUDE_GUARD #include "benchmark_utils.hpp" #define HIP_CHECK(condition) \ { \ hipError_t error = condition; \ if(error != hipSuccess){ \ std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ exit(error); \ } \ } hipCUB-rocm-6.4.3/cmake/000077500000000000000000000000001502260333500146535ustar00rootroot00000000000000hipCUB-rocm-6.4.3/cmake/Dependencies.cmake000066400000000000000000000227541502260333500202550ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # ########################### # hipCUB dependencies # ########################### # NOTE1: the reason we don't scope global state meddling using add_subdirectory # is because CMake < 3.24 lacks CMAKE_FIND_PACKAGE_TARGETS_GLOBAL which # would promote IMPORTED targets of find_package(CONFIG) to be visible # by other parts of the build. So we save and restore global state. # # NOTE2: We disable the ROCMChecks.cmake warning noting that we meddle with # global state. This is consequence of abusing the CMake CXX language # which HIP piggybacks on top of. This kind of HIP support has one chance # at observing the global flags, at the find_package(HIP) invocation. # The device compiler won't be able to pick up changes after that, hence # the warning. # # NOTE3: hipCUB and rocPRIM share CMake options for building tests, benchmarks # and examples. Until that's not fixed, we have to save/restore them. 
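# A hypothetical configure line that exercises the logic below by bypassing system
# packages and fetching every client dependency from source:
#   cmake -S . -B build -DBUILD_TEST=ON -DBUILD_BENCHMARK=ON -DDEPENDENCIES_FORCE_DOWNLOAD=ON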
set(USER_CXX_FLAGS ${CMAKE_CXX_FLAGS}) if(DEFINED BUILD_SHARED_LIBS) set(USER_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) endif() set(USER_ROCM_WARN_TOOLCHAIN_VAR ${ROCM_WARN_TOOLCHAIN_VAR}) set(ROCM_WARN_TOOLCHAIN_VAR OFF CACHE BOOL "") # Turn off warnings and errors for all warnings in dependencies separate_arguments(CXX_FLAGS_LIST NATIVE_COMMAND ${CMAKE_CXX_FLAGS}) list(REMOVE_ITEM CXX_FLAGS_LIST /WX -Werror -Werror=pendantic -pedantic-errors) if(MSVC) list(FILTER CXX_FLAGS_LIST EXCLUDE REGEX "/[Ww]([0-4]?)(all)?") # Remove MSVC warning flags list(APPEND CXX_FLAGS_LIST /w) else() list(FILTER CXX_FLAGS_LIST EXCLUDE REGEX "-W(all|extra|everything)") # Remove GCC/LLVM flags list(APPEND CXX_FLAGS_LIST -w) endif() list(JOIN CXX_FLAGS_LIST " " CMAKE_CXX_FLAGS) # Don't build client dependencies as shared set(BUILD_SHARED_LIBS OFF CACHE BOOL "Global flag to cause add_library() to create shared libraries if on." FORCE) foreach(SHARED_OPTION BUILD_TEST BUILD_BENCHMARK BUILD_EXAMPLE) set(USER_${SHARED_OPTION} ${${SHARED_OPTION}}) set(${SHARED_OPTION} OFF) endforeach() include(FetchContent) # Test dependencies if(USER_BUILD_TEST) # NOTE1: Google Test has created a mess with legacy FindGTest.cmake and newer GTestConfig.cmake # # FindGTest.cmake defines: GTest::GTest, GTest::Main, GTEST_FOUND # # GTestConfig.cmake defines: GTest::gtest, GTest::gtest_main, GTest::gmock, GTest::gmock_main # # NOTE2: Finding GTest in MODULE mode, one cannot invoke find_package in CONFIG mode, because targets # will be duplicately defined. # # NOTE3: The following snippet first tries to find Google Test binary either in MODULE or CONFIG modes. # If neither succeeds it goes on to import Google Test into this build either from a system # source package (apt install googletest on Ubuntu 18.04 only) or GitHub and defines the MODULE # mode targets. Otherwise if MODULE or CONFIG succeeded, then it prints the result to the # console via a non-QUIET find_package call and if CONFIG succeeded, creates ALIAS targets # with the MODULE IMPORTED names. if(NOT DEPENDENCIES_FORCE_DOWNLOAD) find_package(GTest QUIET) endif() if(NOT TARGET GTest::GTest AND NOT TARGET GTest::gtest) option(BUILD_GTEST "Builds the googletest subproject" ON) option(BUILD_GMOCK "Builds the googlemock subproject" OFF) option(INSTALL_GTEST "Enable installation of googletest." OFF) if(EXISTS /usr/src/googletest AND NOT DEPENDENCIES_FORCE_DOWNLOAD) FetchContent_Declare( googletest SOURCE_DIR /usr/src/googletest ) else() message(STATUS "Google Test not found. Fetching...") FetchContent_Declare( googletest GIT_REPOSITORY https://github.com/google/googletest.git GIT_TAG release-1.11.0 ) endif() FetchContent_MakeAvailable(googletest) add_library(GTest::GTest ALIAS gtest) add_library(GTest::Main ALIAS gtest_main) else() find_package(GTest REQUIRED) if(TARGET GTest::gtest_main AND NOT TARGET GTest::Main) add_library(GTest::GTest ALIAS GTest::gtest) add_library(GTest::Main ALIAS GTest::gtest_main) endif() endif() endif(USER_BUILD_TEST) if(USER_BUILD_BENCHMARK) if(NOT DEPENDENCIES_FORCE_DOWNLOAD) find_package(benchmark CONFIG QUIET) endif() if(NOT TARGET benchmark::benchmark) message(STATUS "Google Benchmark not found. Fetching...") option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." OFF) option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark." 
OFF) FetchContent_Declare( googlebench GIT_REPOSITORY https://github.com/google/benchmark.git GIT_TAG v1.8.0 ) FetchContent_MakeAvailable(googlebench) if(NOT TARGET benchmark::benchmark) add_library(benchmark::benchmark ALIAS benchmark) endif() else() find_package(benchmark CONFIG REQUIRED) endif() endif(USER_BUILD_BENCHMARK) # CUB (only for CUDA platform) if(HIP_COMPILER STREQUAL "nvcc") set(CCCL_MINIMUM_VERSION 2.5.0) if(NOT DOWNLOAD_CUB) find_package(CUB ${CCCL_MINIMUM_VERSION} CONFIG) find_package(Thrust ${CCCL_MINIMUM_VERSION} CONFIG) find_package(libcudacxx ${CCCL_MINIMUM_VERSION} CONFIG) endif() if (NOT CUB_FOUND OR NOT Thrust_FOUND OR NOT libcudacxx_FOUND) if(CUB_FOUND OR Thrust_FOUND OR libcudacxx_FOUND) message(WARNING "Found one of CUB, Thrust or libcu++, but not all of them. This can lead to mixing different potentially incompatible versions.") endif() message(STATUS "CUB, Thrust or libcu++ not found, downloading and extracting CCCL ${CCCL_MINIMUM_VERSION}") file(DOWNLOAD https://github.com/NVIDIA/cccl/archive/refs/tags/v${CCCL_MINIMUM_VERSION}.zip ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}.zip STATUS cccl_download_status LOG cccl_download_log) list(GET cccl_download_status 0 cccl_download_error_code) if(cccl_download_error_code) message(FATAL_ERROR "Error: downloading " "https://github.com/NVIDIA/cccl/archive/refs/tags/v${CCCL_MINIMUM_VERSION}.zip failed " "error_code: ${cccl_download_error_code} " "log: ${cccl_download_log}") endif() if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) file(ARCHIVE_EXTRACT INPUT ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}.zip) else() execute_process(COMMAND "${CMAKE_COMMAND}" -E tar xf ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}.zip WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} RESULT_VARIABLE cccl_unpack_error_code) if(cccl_unpack_error_code) message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}.zip failed") endif() endif() find_package(CUB ${CCCL_MINIMUM_VERSION} CONFIG REQUIRED NO_DEFAULT_PATH PATHS ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}/cub) find_package(Thrust ${CCCL_MINIMUM_VERSION} CONFIG REQUIRED NO_DEFAULT_PATH PATHS ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}/thrust) find_package(libcudacxx ${CCCL_MINIMUM_VERSION} CONFIG REQUIRED NO_DEFAULT_PATH PATHS ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}/libcudacxx) endif() else() # rocPRIM (only for ROCm platform) if(NOT DEPENDENCIES_FORCE_DOWNLOAD) # Add default install location for WIN32 and non-WIN32 as hint find_package(rocprim CONFIG QUIET PATHS "${ROCM_ROOT}/lib/cmake/rocprim") endif() if(NOT TARGET roc::rocprim) message(STATUS "rocPRIM not found. 
Fetching...") FetchContent_Declare( prim GIT_REPOSITORY https://github.com/ROCm/rocPRIM.git GIT_TAG develop ) FetchContent_MakeAvailable(prim) if(NOT TARGET roc::rocprim) add_library(roc::rocprim ALIAS rocprim) endif() if(NOT TARGET roc::rocprim_hip) add_library(roc::rocprim_hip ALIAS rocprim_hip) endif() else() find_package(rocprim CONFIG REQUIRED) endif() endif() foreach(SHARED_OPTION BUILD_TEST BUILD_BENCHMARK BUILD_EXAMPLE) set(${SHARED_OPTION} ${USER_${SHARED_OPTION}}) endforeach() # Restore user global state set(CMAKE_CXX_FLAGS ${USER_CXX_FLAGS}) if(DEFINED USER_BUILD_SHARED_LIBS) set(BUILD_SHARED_LIBS ${USER_BUILD_SHARED_LIBS}) else() unset(BUILD_SHARED_LIBS CACHE ) endif() set(ROCM_WARN_TOOLCHAIN_VAR ${USER_ROCM_WARN_TOOLCHAIN_VAR} CACHE BOOL "") hipCUB-rocm-6.4.3/cmake/GenerateResourceSpec.cmake000066400000000000000000000066221502260333500217400ustar00rootroot00000000000000#!/usr/bin/cmake -P find_program(ROCMINFO_EXECUTABLE rocminfo ) if(NOT ROCMINFO_EXECUTABLE) message(FATAL_ERROR "rocminfo not found") endif() execute_process( COMMAND ${ROCMINFO_EXECUTABLE} RESULT_VARIABLE ROCMINFO_EXIT_CODE OUTPUT_VARIABLE ROCMINFO_STDOUT ERROR_VARIABLE ROCMINFO_STDERR ) if(ROCMINFO_EXIT_CODE) message(SEND_ERROR "rocminfo exited with ${ROCMINFO_EXIT_CODE}") message(FATAL_ERROR ${ROCMINFO_STDERR}) endif() string(REGEX MATCHALL [[--(gfx[0-9]+)]] ROCMINFO_MATCHES ${ROCMINFO_STDOUT} ) # NOTE: Unfortunately we don't have structs in CMake, # neither do we have std::partition only list(SORT) # # Transform raw regex matches to pairs of gfx IP and device id # This will be our struct emulation. In C++ it would be # # struct device # { # std::string ip; # int id; # }; # # std::vector GFXIP_AND_ID{ {"gfx900",0},{"gfx803",1},{"gfx900",2} }; # std::sort(GFXIP_AND_ID.begin(), GFXIP_AND_ID.end(), # [](const device& lhs, const device& rhs) # { # return std::lexicographical_compare(lhs.ip.begin(), lhs.ip.end(), # rhs.ip.begin(), rhs.ip.end()); # }); # set(GFXIP_AND_ID) set(ID 0) foreach(ROCMINFO_MATCH IN LISTS ROCMINFO_MATCHES) string(REGEX REPLACE "--" "" ROCMINFO_MATCH ${ROCMINFO_MATCH} ) list(APPEND GFXIP_AND_ID "${ROCMINFO_MATCH}:${ID}") math(EXPR ID "${ID} + 1") endforeach() list(SORT GFXIP_AND_ID) # Now comes the tricky part: implementing the following C++ logic # # std::stringstream JSON_PAYLOAD; # auto it = GFXIP_AND_ID.begin(); # while (it != GFXIP_AND_ID.end()) # { # auto IT = std::find_if(it, GFXIP_AND_ID.end(), # [=](const device& ip_id){ return ip_id.ip.compare(it->ip) != 0; }); # JSON_PAYLOAD << "\n \"" << it->ip << "\": ["; # std::for_each(it, IT, [&](const device& ip_id) # { # JSON_PAYLOAD << # "\n {\n" << # " \"id\": \"" << ip_id.id << "\"\n" << # " },"; # }); # JSON_PAYLOAD.seekp(-1, std::ios_base::end); // discard trailing comma # JSON_PAYLOAD << "\n ],"; # it = IT; # } # JSON_PAYLOAD.seekp(-1, std::ios_base::end); // discard trailing comma # set(JSON_PAYLOAD) set(IT1 0) list(GET GFXIP_AND_ID ${IT1} I1) string(REGEX REPLACE ":[0-9]+" "" IP1 ${I1}) list(LENGTH GFXIP_AND_ID COUNT) while(IT1 LESS COUNT) string(APPEND JSON_PAYLOAD "\n \"${IP1}\": [") set(IT2 ${IT1}) list(GET GFXIP_AND_ID ${IT2} I2) string(REGEX REPLACE [[:[0-9]+$]] "" IP2 ${I2}) string(REGEX REPLACE [[^gfx[0-9]+:]] "" ID2 ${I2}) while(${IP2} STREQUAL ${IP1} AND IT2 LESS COUNT) string(APPEND JSON_PAYLOAD "\n {\n" " \"id\": \"${ID2}\"\n" " }," ) math(EXPR IT2 "${IT2} + 1") if(IT2 LESS COUNT) list(GET GFXIP_AND_ID ${IT2} I2) string(REGEX REPLACE [[:[0-9]+$]] "" IP2 ${I2}) string(REGEX REPLACE [[^gfx[0-9]+:]] "" ID2 ${I2}) endif() 
endwhile() string(REGEX REPLACE [[,$]] "" JSON_PAYLOAD ${JSON_PAYLOAD}) string(APPEND JSON_PAYLOAD "\n ],") set(IT1 ${IT2}) set(IP1 ${IP2}) endwhile() string(REGEX REPLACE [[,$]] "" JSON_PAYLOAD ${JSON_PAYLOAD}) set(JSON_HEAD [[{ "version": { "major": 1, "minor": 0 }, "local": [ {]] ) set(JSON_TAIL [[ } ] }]] ) file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/resources.json ${JSON_HEAD} ${JSON_PAYLOAD} ${JSON_TAIL} ) hipCUB-rocm-6.4.3/cmake/ROCMExportTargetsHeaderOnly.cmake000066400000000000000000000133251502260333500231300ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2019 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # This file is a worksaround for issues rocm-cmake packaging style and PyTorch. # TODO: remove when there is a fix for this issue in either rocm-cmake or PyTorch. 
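# A minimal sketch of how the helper defined below might be called from a project's
# CMakeLists.txt (the target, dependency and file names here are hypothetical):
#   rocm_export_targets_header_only(
#     TARGETS roc::hipcub
#     NAMESPACE roc::
#     DEPENDS PACKAGE rocprim
#     INCLUDE "${CMAKE_CURRENT_BINARY_DIR}/hipcub-dependencies.cmake")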
include(CMakeParseArguments) include(GNUInstallDirs) include(ROCMPackageConfigHelpers) include(ROCMInstallTargets) set(ROCM_INSTALL_LIBDIR lib) function(rocm_write_package_template_function_if FILENAME NAME CHECK_VARIABLE) string(REPLACE ";" " " ARGS "${ARGN}") file(APPEND ${FILENAME} " if(NOT (DEFINED ${CHECK_VARIABLE} AND ${CHECK_VARIABLE}) ) ${NAME}(${ARGS}) endif() ") endfunction() function(rocm_export_targets_header_only) set(options) set(oneValueArgs NAMESPACE EXPORT NAME COMPATIBILITY PREFIX) set(multiValueArgs TARGETS DEPENDS INCLUDE) cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(PACKAGE_NAME ${PROJECT_NAME}) if(PARSE_NAME) set(PACKAGE_NAME ${PARSE_NAME}) endif() string(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UPPER) string(TOLOWER ${PACKAGE_NAME} PACKAGE_NAME_LOWER) set(TARGET_FILE ${PACKAGE_NAME_LOWER}-targets) if(PARSE_EXPORT) set(TARGET_FILE ${PARSE_EXPORT}) endif() set(CONFIG_NAME ${PACKAGE_NAME_LOWER}-config) set(TARGET_VERSION ${PROJECT_VERSION}) if(PARSE_PREFIX) set(PREFIX_DIR ${PARSE_PREFIX}) set(PREFIX_ARG PREFIX ${PREFIX_DIR}) set(BIN_INSTALL_DIR ${PREFIX_DIR}/${CMAKE_INSTALL_BINDIR}) set(LIB_INSTALL_DIR ${PREFIX_DIR}/${ROCM_INSTALL_LIBDIR}) set(INCLUDE_INSTALL_DIR ${PREFIX_DIR}/${CMAKE_INSTALL_INCLUDEDIR}) else() set(BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}) set(LIB_INSTALL_DIR ${ROCM_INSTALL_LIBDIR}) set(INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR}) endif() set(CONFIG_PACKAGE_INSTALL_DIR ${LIB_INSTALL_DIR}/cmake/${PACKAGE_NAME_LOWER}) set(CONFIG_TEMPLATE "${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME_LOWER}-config.cmake.in") file(WRITE ${CONFIG_TEMPLATE} " @PACKAGE_INIT@ ") foreach(NAME ${PACKAGE_NAME} ${PACKAGE_NAME_UPPER} ${PACKAGE_NAME_LOWER}) rocm_write_package_template_function(${CONFIG_TEMPLATE} set_and_check ${NAME}_INCLUDE_DIR "@PACKAGE_INCLUDE_INSTALL_DIR@") rocm_write_package_template_function(${CONFIG_TEMPLATE} set_and_check ${NAME}_INCLUDE_DIRS "@PACKAGE_INCLUDE_INSTALL_DIR@") endforeach() rocm_write_package_template_function(${CONFIG_TEMPLATE} set_and_check ${PACKAGE_NAME}_TARGET_FILE "@PACKAGE_CONFIG_PACKAGE_INSTALL_DIR@/${TARGET_FILE}.cmake") if(PARSE_DEPENDS) rocm_list_split(PARSE_DEPENDS PACKAGE DEPENDS_LIST) foreach(DEPEND ${DEPENDS_LIST}) rocm_write_package_template_function(${CONFIG_TEMPLATE} find_dependency ${${DEPEND}}) endforeach() endif() foreach(INCLUDE ${PARSE_INCLUDE}) rocm_install(FILES ${INCLUDE} DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR}) get_filename_component(INCLUDE_BASE ${INCLUDE} NAME) rocm_write_package_template_function(${CONFIG_TEMPLATE} include "\${CMAKE_CURRENT_LIST_DIR}/${INCLUDE_BASE}") endforeach() if(PARSE_TARGETS) rocm_write_package_template_function(${CONFIG_TEMPLATE} include "\${${PACKAGE_NAME}_TARGET_FILE}") foreach(NAME ${PACKAGE_NAME} ${PACKAGE_NAME_UPPER} ${PACKAGE_NAME_LOWER}) rocm_write_package_template_function_if(${CONFIG_TEMPLATE} set PYTORCH_FOUND_HIP ${NAME}_LIBRARIES ${PARSE_TARGETS}) rocm_write_package_template_function_if(${CONFIG_TEMPLATE} set PYTORCH_FOUND_HIP ${NAME}_LIBRARY ${PARSE_TARGETS}) endforeach() endif() rocm_configure_package_config_file( ${CONFIG_TEMPLATE} ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}.cmake INSTALL_DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} ${PREFIX_ARG} PATH_VARS LIB_INSTALL_DIR INCLUDE_INSTALL_DIR CONFIG_PACKAGE_INSTALL_DIR ) set(COMPATIBILITY_ARG SameMajorVersion) if(PARSE_COMPATIBILITY) set(COMPATIBILITY_ARG ${PARSE_COMPATIBILITY}) endif() write_basic_package_version_file( ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}-version.cmake VERSION 
${TARGET_VERSION} COMPATIBILITY ${COMPATIBILITY_ARG} ) set(NAMESPACE_ARG) if(PARSE_NAMESPACE) set(NAMESPACE_ARG "NAMESPACE;${PARSE_NAMESPACE}") endif() rocm_install( EXPORT ${TARGET_FILE} DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} ${NAMESPACE_ARG} ) rocm_install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}.cmake ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}-version.cmake DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR}) endfunction() hipCUB-rocm-6.4.3/cmake/ROCmCMakeBuildToolsDependency.cmake000066400000000000000000000043341502260333500233620ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. if(NOT DEPENDENCIES_FORCE_DOWNLOAD) find_package(ROCM 0.7.3 CONFIG QUIET PATHS "${ROCM_ROOT}") endif() if(NOT ROCM_FOUND) message(STATUS "ROCm CMake not found. Fetching...") # We don't really want to consume the build and test targets of ROCm CMake. # CMake 3.18 allows omitting them, even though there's a CMakeLists.txt in source root. if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) set(SOURCE_SUBDIR_ARG SOURCE_SUBDIR "DISABLE ADDING TO BUILD") else() set(SOURCE_SUBDIR_ARG) endif() include(FetchContent) FetchContent_Declare( rocm-cmake URL https://github.com/RadeonOpenCompute/rocm-cmake/archive/refs/tags/rocm-5.2.0.tar.gz ${SOURCE_SUBDIR_ARG} ) FetchContent_MakeAvailable(rocm-cmake) find_package(ROCM CONFIG REQUIRED NO_DEFAULT_PATH PATHS "${rocm-cmake_SOURCE_DIR}") else() find_package(ROCM 0.7.3 CONFIG REQUIRED PATHS "${ROCM_ROOT}") endif() include(ROCMSetupVersion) include(ROCMCreatePackage) include(ROCMInstallTargets) include(ROCMPackageConfigHelpers) include(ROCMInstallSymlinks) include(ROCMHeaderWrapper) include(ROCMCheckTargetIds) include(ROCMClients) hipCUB-rocm-6.4.3/cmake/SetupNVCC.cmake000066400000000000000000000125721502260333500174360ustar00rootroot00000000000000# MIT License # # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # Find HIP package and verify that correct C++ compiler was selected for available # platform. On ROCm platform host and device code is compiled by the same compiler: # hipcc or clang. On CUDA host can be compiled by any C++ compiler while device # code is compiled by nvcc compiler (CMake's CUDA package handles this). # A function for automatic detection of the CC of the installed NV GPUs function(hip_cuda_detect_cc out_variable) set(__cufile ${PROJECT_BINARY_DIR}/detect_nvgpus_cc.cu) file(WRITE ${__cufile} "" "#include \n" "#include \n" "int main()\n" "{\n" " int count = 0;\n" " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n" " if (count == 0) return -1;\n" " std::set list_cc;\n" " for (int device = 0; device < count; ++device)\n" " {\n" " cudaDeviceProp prop;\n" " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" " list_cc.insert(prop.major*10+prop.minor);\n" " }\n" " for (std::set::iterator itr = list_cc.begin(); itr != list_cc.end(); itr++)\n" " {\n" " if(itr != list_cc.begin()) std::cout << ';';\n" " std::cout << *itr;\n" " }\n" " return 0;\n" "}\n") execute_process( COMMAND ${HIP_HIPCC_EXECUTABLE} "-Wno-deprecated-gpu-targets" "--run" "${__cufile}" WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out ) if(__nvcc_res EQUAL 0) set(HIP_CUDA_detected_cc ${__nvcc_out} CACHE INTERNAL "The detected CC of installed NV GPUs" FORCE) endif() if(NOT HIP_CUDA_detected_cc) set(HIP_CUDA_detected_cc "53") set(${out_variable} ${HIP_CUDA_detected_cc} PARENT_SCOPE) else() set(${out_variable} ${HIP_CUDA_detected_cc} PARENT_SCOPE) endif() endfunction() ################################################################################################ ### Non macro/function section ################################################################################################ # Set the default value for CMAKE_CUDA_COMPILER if it's empty if(CMAKE_CUDA_COMPILER STREQUAL "") set(CMAKE_CUDA_COMPILER "nvcc") endif() # Get CUDA enable_language("CUDA") set(CMAKE_CUDA_STANDARD 14) # Suppressing warnings set(HIP_NVCC_FLAGS " ${HIP_NVCC_FLAGS} -Wno-deprecated-gpu-targets -Xcompiler -Wno-return-type -Wno-deprecated-declarations ") # Use NVGPU_TARGETS to set CUDA architectures (compute capabilities) # For example: -DNVGPU_TARGETS="50;61;62" set(DEFAULT_NVGPU_TARGETS "") # If NVGPU_TARGETS is empty get default value for it if("x${NVGPU_TARGETS}" STREQUAL "x") hip_cuda_detect_cc(detected_cc) 
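  # At this point detected_cc holds the semicolon-separated compute capabilities
  # reported by the probe above (or the "53" fallback when detection fails).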
    set(DEFAULT_NVGPU_TARGETS "${detected_cc}")
endif()
set(NVGPU_TARGETS "${DEFAULT_NVGPU_TARGETS}"
    CACHE STRING "List of NVIDIA GPU targets (compute capabilities), for example \"35;50\""
)
set(CMAKE_CUDA_ARCHITECTURES ${NVGPU_TARGETS})
if (NOT _HIPCUB_HIP_NVCC_FLAGS_SET)
    execute_process(
        COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --cpp_config
        OUTPUT_VARIABLE HIP_CPP_CONFIG_FLAGS
        OUTPUT_STRIP_TRAILING_WHITESPACE
        ERROR_STRIP_TRAILING_WHITESPACE
    )
    # Generate compiler flags based on targeted CUDA architectures if CMake doesn't. (Controlled by policy CMP0104, on by default after 3.18)
    if(CMAKE_VERSION VERSION_LESS "3.18")
        foreach(CUDA_ARCH ${NVGPU_TARGETS})
            list(APPEND HIP_NVCC_FLAGS "--generate-code" "arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH}")
            list(APPEND HIP_NVCC_FLAGS "--generate-code" "arch=compute_${CUDA_ARCH},code=compute_${CUDA_ARCH}")
        endforeach()
    endif()
    # Update list parameter
    list(JOIN HIP_NVCC_FLAGS " " HIP_NVCC_FLAGS)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${HIP_CPP_CONFIG_FLAGS} ${HIP_NVCC_FLAGS}"
        CACHE STRING "Cuda compile flags" FORCE)
    set(_HIPCUB_HIP_NVCC_FLAGS_SET ON CACHE INTERNAL "")
endif()

# Ignore warnings about #pragma unroll
# and about deprecated CUDA function(s) used in hip/nvcc_detail/hip_runtime_api.h
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_CPP_CONFIG_FLAGS_STRIP} -Wno-unknown-pragmas -Wno-deprecated-declarations" CACHE STRING "compile flags" FORCE)
hipCUB-rocm-6.4.3/cmake/Summary.cmake000066400000000000000000000062721502260333500173210ustar00rootroot00000000000000# MIT License
#
# Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
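
# --- Illustrative sketch (not part of the original hipCUB sources) -------------------------
# The CUDA path in cmake/SetupNVCC.cmake above appends a pair of --generate-code options per
# requested compute capability whenever CMake is too old to honour CMAKE_CUDA_ARCHITECTURES
# itself (policy CMP0104, CMake < 3.18). A minimal, self-contained sketch of that idea is
# shown below; the function name and output variable are hypothetical and nothing in the
# build defines or calls them.
function(sketch_nvcc_gencode_flags out_var)
    set(flags)
    foreach(arch IN LISTS ARGN)
        # e.g. arch 61 -> --generate-code arch=compute_61,code=sm_61 (device binary)
        list(APPEND flags "--generate-code" "arch=compute_${arch},code=sm_${arch}")
        # ... plus embedded PTX for the same architecture, for forward compatibility
        list(APPEND flags "--generate-code" "arch=compute_${arch},code=compute_${arch}")
    endforeach()
    set(${out_var} "${flags}" PARENT_SCOPE)
endfunction()
# Example (hypothetical): sketch_nvcc_gencode_flags(GENCODE_FLAGS 61 70 80) leaves six
# --generate-code option pairs in GENCODE_FLAGS, mirroring what SetupNVCC.cmake appends to
# HIP_NVCC_FLAGS for NVGPU_TARGETS="61;70;80".
# --------------------------------------------------------------------------------------------
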
function(print_configuration_summary) message(STATUS "") message(STATUS "******** Summary ********") message(STATUS "General:") message(STATUS " System : ${CMAKE_SYSTEM_NAME}") message(STATUS " HIP ROOT : ${HIP_ROOT_DIR}") if(USE_HIPCXX) message(STATUS " HIP compiler : ${CMAKE_HIP_COMPILER}") message(STATUS " HIP compiler version : ${CMAKE_HIP_COMPILER_VERSION}") string(STRIP "${CMAKE_HIP_FLAGS}" CMAKE_HIP_FLAGS_STRIP) message(STATUS " HIP flags : ${CMAKE_HIP_FLAGS_STRIP}") else() message(STATUS " C++ compiler : ${CMAKE_CXX_COMPILER}") message(STATUS " C++ compiler version : ${CMAKE_CXX_COMPILER_VERSION}") string(STRIP "${CMAKE_CXX_FLAGS}" CMAKE_CXX_FLAGS_STRIP) message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS_STRIP}") endif() if(HIP_COMPILER STREQUAL "nvcc") string(REPLACE ";" " " HIP_NVCC_FLAGS_STRIP "${HIP_NVCC_FLAGS}") string(STRIP "${HIP_NVCC_FLAGS_STRIP}" HIP_NVCC_FLAGS_STRIP) string(REPLACE ";" " " HIP_CPP_CONFIG_FLAGS_STRIP "${HIP_CPP_CONFIG_FLAGS}") string(STRIP "${HIP_CPP_CONFIG_FLAGS_STRIP}" HIP_CPP_CONFIG_FLAGS_STRIP) message(STATUS " HIP flags : ${HIP_CPP_CONFIG_FLAGS_STRIP}") message(STATUS " NVCC flags : ${HIP_NVCC_FLAGS_STRIP}") endif() message(STATUS " Build type : ${CMAKE_BUILD_TYPE}") message(STATUS " Install prefix : ${CMAKE_INSTALL_PREFIX}") if(USE_HIPCXX) message(STATUS " Device targets : ${CMAKE_HIP_ARCHITECTURES}") elseif(HIP_COMPILER STREQUAL "clang") message(STATUS " Device targets : ${AMDGPU_TARGETS}") else() message(STATUS " Device targets : ${NVGPU_TARGETS}") endif() message(STATUS "") message(STATUS " DEPENDENCIES_FORCE_DOWNLOAD : ${DEPENDENCIES_FORCE_DOWNLOAD}") message(STATUS " BUILD_TEST : ${BUILD_TEST}") message(STATUS " BUILD_BENCHMARK : ${BUILD_BENCHMARK}") message(STATUS " BUILD_ADDRESS_SANITIZER : ${BUILD_ADDRESS_SANITIZER}") endfunction() hipCUB-rocm-6.4.3/cmake/VerifyCompiler.cmake000066400000000000000000000041271502260333500206200ustar00rootroot00000000000000# MIT License # # Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
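
# --- Illustrative sketch (not part of the original hipCUB sources) -------------------------
# The platform checks below reduce to a pattern match on the configured compiler: on ROCm the
# C++ (or HIP) compiler must match ".*hipcc$" or ".*clang\+\+", while on the CUDA platform a
# GNU C++ compiler is accepted and SetupNVCC.cmake is included. A hypothetical helper
# expressing that test is sketched here for clarity; nothing in the build defines or calls it.
function(sketch_is_hip_aware_compiler compiler_path out_var)
    if(compiler_path MATCHES ".*hipcc$" OR compiler_path MATCHES ".*clang\\+\\+")
        set(${out_var} TRUE PARENT_SCOPE)   # e.g. /opt/rocm/bin/hipcc or /opt/rocm/bin/amdclang++
    else()
        set(${out_var} FALSE PARENT_SCOPE)  # e.g. a plain g++ on the ROCm platform
    endif()
endfunction()
# --------------------------------------------------------------------------------------------
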
# HIP supports config-mode search for both NVIDIA and AMD platforms as of 5.7
find_package(hip REQUIRED CONFIG PATHS "${ROCM_ROOT}/lib/cmake/hip")

if(HIP_COMPILER STREQUAL "nvcc")
    if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
        include(SetupNVCC)
    else()
        message(WARNING "On CUDA platform 'g++' is the recommended C++ compiler.")
    endif()
elseif(HIP_COMPILER STREQUAL "clang")
    if(USE_HIPCXX)
        if(NOT (CMAKE_HIP_COMPILER MATCHES ".*hipcc$" OR CMAKE_HIP_COMPILER MATCHES ".*clang\\+\\+"))
            message(FATAL_ERROR "On ROCm platform 'hipcc' or HIP-aware Clang must be used as HIP compiler.")
        endif()
    else()
        if(NOT (CMAKE_CXX_COMPILER MATCHES ".*hipcc$" OR CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+"))
            message(FATAL_ERROR "On ROCm platform 'hipcc' or HIP-aware Clang must be used as C++ compiler.")
        endif()
    endif()
else()
    message(FATAL_ERROR "HIP_COMPILER must be `clang` (AMD ROCm platform)")
endif()
hipCUB-rocm-6.4.3/conanfile.py000066400000000000000000000011301502260333500160760ustar00rootroot00000000000000# Copyright 2021 Advanced Micro Devices, Inc.

# This conanfile is used to install development requirements,
# e.g.
#   conan install -o clients=True -if build/deps .

from conans import ConanFile, CMake


class ConanPkgReqs(ConanFile):
    settings = "os", "compiler", "build_type", "arch"
    generators = "cmake_find_package"
    options = {
        "shared": [True, False],
        "clients": [True, False],
    }
    default_options = {
        "shared": True,
        "clients": False,
    }

    def requirements(self):
        if self.options.clients:
            self.requires("gtest/1.11.0")
hipCUB-rocm-6.4.3/docs/000077500000000000000000000000001502260333500145235ustar00rootroot00000000000000hipCUB-rocm-6.4.3/docs/.gitignore000066400000000000000000000001231502260333500165070ustar00rootroot00000000000000/_build/
/_doxygen
/doxygen/html
/doxygen/xml
/doxygen/hipCUB.tag
/sphinx/_toc.yml
hipCUB-rocm-6.4.3/docs/api-reference/000077500000000000000000000000001502260333500172305ustar00rootroot00000000000000hipCUB-rocm-6.4.3/docs/api-reference/data-type-support.rst000066400000000000000000000014741502260333500233710ustar00rootroot00000000000000.. meta::
  :description: hipcub API library data type support
  :keywords: hipcub, ROCm, API library, API reference, data type, support

.. _data-type-support:

******************************************
Data type support
******************************************

hipCUB supports the following data types on both ROCm and CUDA:

* ``int8``
* ``int16``
* ``int32``
* ``float32``
* ``float64``

``float8``, ``bfloat8``, and ``tensorfloat32`` are not supported by hipCUB on either ROCm or CUDA.

The NVIDIA backend does not support ``float16`` or ``bfloat16`` with the following API calls: ``block_adjacent_difference``, ``device_adjacent_difference``, ``device_reduce``, ``device_scan``, ``device_segmented_reduce``, and ``device_select``. The NVIDIA backend also does not support ``bfloat16`` with ``device_histogram``.
hipCUB-rocm-6.4.3/docs/conf.py000066400000000000000000000022021502260333500160210ustar00rootroot00000000000000# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options.
For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html import re from rocm_docs import ROCmDocs with open('../CMakeLists.txt', encoding='utf-8') as f: match = re.search(r'set\(VERSION_STRING\s+\"?([0-9.]+)[^0-9.]+', f.read()) if not match: raise ValueError("VERSION not found!") version_number = match[1] left_nav_title = f"hipCUB {version_number} Documentation" # for PDF output on Read the Docs project = "hipCUB Documentation" author = "Advanced Micro Devices, Inc." copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved." version = version_number release = version_number external_toc_path = "./sphinx/_toc.yml" docs_core = ROCmDocs(left_nav_title) docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml") docs_core.enable_api_reference() docs_core.setup() external_projects_current_project = "hipcub" for sphinx_var in ROCmDocs.SPHINX_VARS: globals()[sphinx_var] = getattr(docs_core, sphinx_var) hipCUB-rocm-6.4.3/docs/doxygen/000077500000000000000000000000001502260333500162005ustar00rootroot00000000000000hipCUB-rocm-6.4.3/docs/doxygen/Doxyfile000066400000000000000000003174031502260333500177160ustar00rootroot00000000000000# Doxyfile 1.8.11 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. # # All text after a double hash (##) is considered a comment and is placed in # front of the TAG it is preceding. # # All text after a single hash (#) is considered a comment and will be ignored. # The format is: # TAG = value [value, ...] # For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the config file # that follow. The default is UTF-8 which is also the encoding used for all text # before the first occurrence of this tag. Doxygen uses libiconv (or the iconv # built into libc) for the transcoding. See http://www.gnu.org/software/libiconv # for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded by # double-quotes, unless you are using Doxywizard) that should identify the # project for which the documentation is generated. This name is used in the # title of most generated pages and in a few other places. # The default value is: My Project. PROJECT_NAME = hipCUB # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version # control system is used. PROJECT_NUMBER = # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. The maximum height of the logo should not exceed 55 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. 
PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. OUTPUT_DIRECTORY = . # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and # will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise causes # performance problems for the file system. # The default value is: NO. CREATE_SUBDIRS = NO # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode # U+3044. # The default value is: NO. ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, # Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), # Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, # Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), # Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, # Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, # Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, # Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. # The default value is: YES. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief # description of a member or function before the detailed description # # Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. # The default value is: YES. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator that is # used to form the text in various listings. Each string in this list, if found # as the leading text of the brief description, will be stripped from the text # and the result, after processing the whole list, is used as the annotated # text. Otherwise, the brief description is used as-is. If left blank, the # following values are used ($name is automatically replaced with the name of # the entity):The $name class, The $name widget, The $name file, is, provides, # specifies, contains, represents, a, an and the. ABBREVIATE_BRIEF = "The $name class" \ "The $name widget" \ "The $name file" \ is \ provides \ specifies \ contains \ represents \ a \ an \ the # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # doxygen will generate a detailed section even if there is only a brief # description. # The default value is: NO. 
ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. # The default value is: NO. INLINE_INHERITED_MEMB = YES # If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path # before files name in the file list and in the header files. If set to NO the # shortest path that makes the file name unique will be used # The default value is: YES. FULL_PATH_NAMES = YES # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. # Stripping is only done if one of the specified strings matches the left-hand # part of the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the path to # strip. # # Note that you can specify absolute paths here, but also relative paths, which # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. STRIP_FROM_PATH = ../../hipcub/include # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which # header file to include in order to use a class. If left blank only the name of # the header file containing the class definition is used. Otherwise one should # specify the list of include paths that are normally passed to the compiler # using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't # support long names like on DOS, Mac, or CD-ROM. # The default value is: NO. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the # first line (until the first dot) of a Javadoc-style comment as the brief # description. If set to NO, the Javadoc-style will behave just like regular Qt- # style comments (thus requiring an explicit @brief command for a brief # description.) # The default value is: NO. JAVADOC_AUTOBRIEF = NO # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus # requiring an explicit \brief command for a brief description.) # The default value is: NO. QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a # multi-line C++ special comment block (i.e. a block of //! or /// comments) as # a brief description. This used to be the default behavior. The new default is # to treat a multi-line C++ comment block as a detailed description. Set this # tag to YES if you prefer the old behavior instead. # # Note that setting this tag to YES also means that rational rose comments are # not recognized any more. # The default value is: NO. MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new # page for each member. If set to NO, the documentation of a member will be part # of the file/class/namespace that contains it. 
# The default value is: NO. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen # uses this value to replace tabs by spaces in code fragments. # Minimum value: 1, maximum value: 16, default value: 4. TAB_SIZE = 4 # This tag can be used to specify a number of aliases that act as commands in # the documentation. An alias has the form: # name=value # For example adding # "sideeffect=@par Side Effects:\n" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all # members will be omitted, etc. # The default value is: NO. OPTIMIZE_OUTPUT_FOR_C = NO # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or # Python sources only. Doxygen will then generate output that is more tailored # for that language. For instance, namespaces will be presented as packages, # qualified scopes will look different, etc. # The default value is: NO. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources. Doxygen will then generate output that is tailored for Fortran. # The default value is: NO. OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for VHDL. # The default value is: NO. OPTIMIZE_OUTPUT_VHDL = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, Javascript, # C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: # FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: # Fortran. In the later case the parser tries to guess whether the code is fixed # or free formatted code, this is the default for Fortran type files), VHDL. For # instance to make doxygen treat .inc files as Fortran files (default is PHP), # and .f files as C (default is Fortran), use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable # documentation. See http://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibilities issues. # The default value is: YES. 
MARKDOWN_SUPPORT = YES # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or # globally by setting AUTOLINK_SUPPORT to NO. # The default value is: YES. AUTOLINK_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should set this # tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); # versus func(std::string) {}). This also make the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. # The default value is: NO. BUILTIN_STL_SUPPORT = NO # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. # The default value is: NO. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: # http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate # getter and setter methods for a property. Setting this option to YES will make # doxygen to replace the get and set methods by a property in the documentation. # This will only work if the methods are indeed getting or setting a simple # type. If this is not the case, or you want to show the methods anyway, you # should set this option to NO. # The default value is: YES. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. # The default value is: NO. DISTRIBUTE_GROUP_DOC = NO # If one adds a struct or class to a group and this option is enabled, then also # any nested class or struct is added to the same group. By default this option # is disabled and one has to add nested compounds explicitly via \ingroup. # The default value is: NO. GROUP_NESTED_COMPOUNDS = NO # Set the SUBGROUPING tag to YES to allow class member groups of the same type # (for instance a group of public functions) to be put as a subgroup of that # type (e.g. under the Public Functions section). Set it to NO to prevent # subgrouping. Alternatively, this can be done per class using the # \nosubgrouping command. # The default value is: YES. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions # are shown inside the group in which they are included (e.g. using \ingroup) # instead of on a separate page (for HTML and Man pages) or section (for LaTeX # and RTF). # # Note that this feature does not work in combination with # SEPARATE_MEMBER_PAGES. # The default value is: NO. INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions # with only public data fields or simple typedef fields will be shown inline in # the documentation of the scope in which they are defined (i.e. file, # namespace, or group documentation), provided this scope is documented. 
If set # to NO, structs, classes, and unions are shown on a separate page (for HTML and # Man pages) or section (for LaTeX and RTF). # The default value is: NO. INLINE_SIMPLE_STRUCTS = NO # When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or # enum is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically be # useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. # The default value is: NO. TYPEDEF_HIDES_STRUCT = NO # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can be # an expensive process and often the same symbol appears multiple times in the # code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small # doxygen will become slower. If the cache is too large, memory is wasted. The # cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range # is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 # symbols. At the end of a run doxygen will report the cache usage and suggest # the optimal cache size from a speed point of view. # Minimum value: 0, maximum value: 9, default value: 0. LOOKUP_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in # documentation are documented, even if no documentation was available. Private # class members and static file members will be hidden unless the # EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. # Note: This will also disable the warnings about undocumented members that are # normally produced when WARNINGS is set to YES. # The default value is: NO. EXTRACT_ALL = NO # If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will # be included in the documentation. # The default value is: NO. EXTRACT_PRIVATE = NO # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. EXTRACT_PACKAGE = NO # If the EXTRACT_STATIC tag is set to YES, all static members of a file will be # included in the documentation. # The default value is: NO. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined # locally in source files will be included in the documentation. If set to NO, # only classes defined in header files are included. Does not have any effect # for Java sources. # The default value is: YES. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. If set to YES, local methods, # which are defined in the implementation section but not in the interface are # included in the documentation. If set to NO, only methods in the interface are # included. # The default value is: NO. 
EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base name of # the file that contains the anonymous namespace. By default anonymous namespace # are hidden. # The default value is: NO. EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation # section is generated. This option has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option # has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend # (class|struct|union) declarations. If set to NO, these declarations will be # included in the documentation. # The default value is: NO. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any # documentation blocks found inside the body of a function. If set to NO, these # blocks will be appended to the function's detailed documentation block. # The default value is: NO. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation that is typed after a # \internal command is included. If the tag is set to NO then the documentation # will be excluded. Set it to YES to include the internal documentation. # The default value is: NO. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file # names in lower-case letters. If set to YES, upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. # The default value is: system dependent. CASE_SENSE_NAMES = YES # If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with # their full class and namespace scopes in the documentation. If set to YES, the # scope will be hidden. # The default value is: NO. HIDE_SCOPE_NAMES = NO # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will # append additional text to a page's title, such as Class Reference. If set to # YES the compound reference will be hidden. # The default value is: NO. HIDE_COMPOUND_REFERENCE= NO # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. SHOW_INCLUDE_FILES = YES # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each # grouped member an include statement to the documentation, telling the reader # which file to include in order to use the member. # The default value is: NO. SHOW_GROUPED_MEMB_INC = NO # If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include # files with double quotes in the documentation rather than with sharp brackets. # The default value is: NO. 
FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the # documentation for inline members. # The default value is: YES. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the # (detailed) documentation of file and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. # The default value is: YES. SORT_MEMBER_DOCS = NO # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief # descriptions of file, namespace and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. Note that # this will also influence the order of the classes in the class list. # The default value is: NO. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the # (brief and detailed) documentation of class members so that constructors and # destructors are listed first. If set to NO the constructors will appear in the # respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. # Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief # member documentation. # Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting # detailed member documentation. # The default value is: NO. SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy # of group names into alphabetical order. If set to NO the group names will # appear in their defined order. # The default value is: NO. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by # fully-qualified names, including namespaces. If set to NO, the class list will # be sorted only by class name, not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the alphabetical # list. # The default value is: NO. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper # type resolution of all parameters of a function it will reject a match between # the prototype and the implementation of a member function even if there is # only one candidate or it is obvious which candidate to choose by doing a # simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still # accept a match between prototype and implementation in such cases. # The default value is: NO. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo # list. This list is created by putting \todo commands in the documentation. # The default value is: YES. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test # list. This list is created by putting \test commands in the documentation. # The default value is: YES. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug # list. This list is created by putting \bug commands in the documentation. # The default value is: YES. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) # the deprecated list. This list is created by putting \deprecated commands in # the documentation. # The default value is: YES. GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional documentation # sections, marked by \if ... 
\endif and \cond # ... \endcond blocks. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the # documentation. If the initializer consists of more lines than specified here # it will be hidden. Use a value of 0 to hide initializers completely. The # appearance of the value of individual variables and macros / defines can be # controlled using \showinitializer or \hideinitializer command in the # documentation regardless of this setting. # Minimum value: 0, maximum value: 10000, default value: 30. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated at # the bottom of the documentation of classes and structs. If set to YES, the # list will mention the files that were used to generate the documentation. # The default value is: YES. SHOW_USED_FILES = YES # Set the SHOW_FILES tag to NO to disable the generation of the Files page. This # will remove the Files entry from the Quick Index and from the Folder Tree View # (if specified). # The default value is: YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces # page. This will remove the Namespaces entry from the Quick Index and from the # Folder Tree View (if specified). # The default value is: YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command command input-file, where command is the value of the # FILE_VERSION_FILTER tag, and input-file is the name of an input file provided # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml # will be used as the name of the layout file. # # Note that if you run doxygen from a directory containing a file called # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool # to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated to # standard output by doxygen. If QUIET is set to YES this implies that the # messages are off. # The default value is: NO. 
QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES # this implies that the warnings are on. # # Tip: Turn warnings on while writing the documentation. # The default value is: YES. WARNINGS = YES # If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate # warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag # will automatically be disabled. # The default value is: YES. WARN_IF_UNDOCUMENTED = YES # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some parameters # in a documented function, or documenting parameters that don't exist or using # markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete # parameter documentation, but not about the absence of documentation. # The default value is: NO. WARN_NO_PARAMDOC = NO # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when # a warning is encountered. # The default value is: NO. WARN_AS_ERROR = YES # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which # will be replaced by the file and line number from which the warning originated # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard # error (stderr). WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag is used to specify the files and/or directories that contain # documented source files. You may enter file names like myfile.cpp or # directories like /usr/src/myproject. Separate the files or directories with # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. INPUT = ../../hipcub/include/hipcub # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv # documentation (see: http://www.gnu.org/software/libiconv) for the list of # possible encodings. # The default value is: UTF-8. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. 
# # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, # *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f, *.for, *.tcl, # *.vhd, *.vhdl, *.ucf, *.qsf, *.as and *.js. FILE_PATTERNS = # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. # The default value is: NO. RECURSIVE = YES # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # # Note that relative paths are relative to the directory from which doxygen is # run. EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. # The default value is: NO. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* EXCLUDE_PATTERNS = */detail/*,*/backend/* # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* EXCLUDE_SYMBOLS = detail::* # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and # *.h) to filter out the source-files in the directories. If left blank all # files are included. EXAMPLE_PATTERNS = * # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude commands # irrespective of the value of the RECURSIVE tag. # The default value is: NO. EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or directories # that contain images that are to be included in the documentation (see the # \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command: # # # # where is the value of the INPUT_FILTER tag, and is the # name of an input file. Doxygen will then use the output that the filter # program writes to standard output. If FILTER_PATTERNS is specified, this tag # will be ignored. # # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. 
# # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the # filter if there is a match. The filters are a list of the form: pattern=filter # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for # producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). # The default value is: NO. FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. A pattern will override the setting for FILTER_PATTERN (if any) and # it is also possible to disable source filtering for a specific pattern using # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. USE_MDFILE_AS_MAINPAGE = #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will be # generated. Documented entities will be cross-referenced with these sources. # # Note: To get rid of all source code in the generated output, make sure that # also VERBATIM_HEADERS is set to NO. # The default value is: NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body of functions, # classes and enums directly into the documentation. # The default value is: NO. INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any # special comment blocks from generated source code fragments. Normal C, C++ and # Fortran comments will always remain visible. # The default value is: YES. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented # function all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES then for each documented function # all documented entities called/used by that function will be listed. # The default value is: NO. REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set # to YES then the hyperlinks from functions in REFERENCES_RELATION and # REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will # link to the documentation. # The default value is: YES. 
REFERENCES_LINK_SOURCE = YES # If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the # source code will show a tooltip with additional information such as prototype, # brief description and links to the definition and documentation. Since this # will make the HTML file larger and loading of large files a bit slower, you # can opt to disable this feature. # The default value is: YES. # This tag requires that the tag SOURCE_BROWSER is set to YES. SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system # (see http://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global # - Enable SOURCE_BROWSER and USE_HTAGS in the config file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # # Doxygen will invoke htags (and that will in turn invoke gtags), so these # tools must be available from the command line (i.e. in the search path). # # The result: instead of the source browser generated by doxygen, the links to # source code will now point to the output of htags. # The default value is: NO. # This tag requires that the tag SOURCE_BROWSER is set to YES. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a # verbatim copy of the header file for each class for which an include is # specified. Set to NO to disable this. # See also: Section \class. # The default value is: YES. VERBATIM_HEADERS = YES # If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the # clang parser (see: http://clang.llvm.org/) for more accurate parsing at the # cost of reduced performance. This can be particularly helpful with template # rich C++ code for which doxygen's built-in parser lacks the necessary type # information. # Note: The availability of this option depends on whether or not doxygen was # generated with the -Duse-libclang=ON option for CMake. # The default value is: NO. CLANG_ASSISTED_PARSING = NO # If clang assisted parsing is enabled you can provide the compiler with command # line options that you would normally use when invoking the compiler. Note that # the include paths will already be set by doxygen for the files and directories # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all # compounds will be generated. Enable this if the project contains a lot of # classes, structs, unions or interfaces. # The default value is: YES. ALPHABETICAL_INDEX = NO # The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in # which the alphabetical index list will be split. # Minimum value: 1, maximum value: 20, default value: 5. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all classes will # be put under the same header in the alphabetical index. 
The IGNORE_PREFIX tag # can be used to specify a prefix (or a list of prefixes) that should be ignored # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output # The default value is: YES. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of # it. # The default directory is: html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for each # generated HTML page (for example: .htm, .php, .asp). # The default value is: .html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a user-defined HTML header file for # each generated HTML page. If the tag is left blank doxygen will generate a # standard header. # # To get valid HTML the header file that includes any scripts and style sheets # that doxygen needs, which is dependent on the configuration options used (e.g. # the setting GENERATE_TREEVIEW). It is highly recommended to start with a # default header using # doxygen -w html new_header.html new_footer.html new_stylesheet.css # YourConfigFile # and then modify the file new_header.html. See also section "Doxygen usage" # for information on how to generate the default header that doxygen normally # uses. # Note: The header is subject to change so you typically have to regenerate the # default header when upgrading to a newer version of doxygen. For a description # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_HEADER = ../_doxygen/header.html # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard # footer. See HTML_HEADER for more information on how to generate a default # footer and what special commands can be used inside the footer. See also # section "Doxygen usage" for information on how to generate the default footer # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FOOTER = ../_doxygen/footer.html # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of # the HTML output. If left blank doxygen will generate a default style sheet. # See also section "Doxygen usage" for information on how to generate the style # sheet that doxygen normally uses. # Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as # it is more robust and this tag (HTML_STYLESHEET) will in the future become # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_STYLESHEET = ../_doxygen/stylesheet.css # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets # created by doxygen. Using this option one can overrule certain style aspects. 

# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
# cascading style sheets that are included after the standard style sheets
# created by doxygen. Using this option one can overrule certain style aspects.
# This is preferred over using HTML_STYLESHEET since it does not replace the
# standard style sheet and is therefore more robust against future updates.
# Doxygen will copy the style sheet files to the output directory.
# Note: The order of the extra style sheet files is of importance (e.g. the last
# style sheet in the list overrules the setting of the previous ones in the
# list). For an example see the documentation.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_EXTRA_STYLESHEET  = ../_doxygen/extra_stylesheet.css

# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
# other source files which should be copied to the HTML output directory. Note
# that these files will be copied to the base HTML output directory. Use the
# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
# files will be copied as-is; there are no commands or markers available.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_EXTRA_FILES       =

# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
# will adjust the colors in the style sheet and background images according to
# this color. Hue is specified as an angle on a colorwheel, see
# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
# purple, and 360 is red again.
# Minimum value: 0, maximum value: 359, default value: 220.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_COLORSTYLE_HUE    = 220

# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
# in the HTML output. For a value of 0 the output will use grayscales only. A
# value of 255 will produce the most vivid colors.
# Minimum value: 0, maximum value: 255, default value: 100.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_COLORSTYLE_SAT    = 100

# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
# luminance component of the colors in the HTML output. Values below 100
# gradually make the output lighter, whereas values above 100 make the output
# darker. The value divided by 100 is the actual gamma applied, so 80 represents
# a gamma of 0.8. The value 220 represents a gamma of 2.2, and 100 does not
# change the gamma.
# Minimum value: 40, maximum value: 240, default value: 80.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_COLORSTYLE_GAMMA  = 80

# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
# page will contain the date and time when the page was generated. Setting this
# to YES can help to show when doxygen was last run and thus if the
# documentation is up to date.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_TIMESTAMP         = NO

# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
# documentation will contain sections that can be hidden and shown after the
# page has loaded.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_DYNAMIC_SECTIONS  = NO
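
# Illustrative, commented-out variation on the color scheme above, based only on
# the value ranges documented in the comments (this configuration keeps the
# defaults): setting the saturation to 0 produces a grayscale theme, in which
# case the hue value has no visible effect.
#
# HTML_COLORSTYLE_SAT = 0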

# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
# shown in the various tree structured indices initially; the user can expand
# and collapse entries dynamically later on. Doxygen will expand the tree to
# such a level that at most the specified number of entries are visible (unless
# a fully collapsed tree already exceeds this amount). So setting the number of
# entries to 1 will produce a fully collapsed tree by default. 0 is a special
# value representing an infinite number of entries and will result in a fully
# expanded tree by default.
# Minimum value: 0, maximum value: 9999, default value: 100.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_INDEX_NUM_ENTRIES = 100

# If the GENERATE_DOCSET tag is set to YES, additional index files will be
# generated that can be used as input for Apple's Xcode 3 integrated development
# environment (see: http://developer.apple.com/tools/xcode/), introduced with
# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
# Makefile in the HTML output directory. Running make will produce the docset in
# that directory and running make install will install the docset in
# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
# for more information.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

GENERATE_DOCSET        = NO

# This tag determines the name of the docset feed. A documentation feed provides
# an umbrella under which multiple documentation sets from a single provider
# (such as a company or product suite) can be grouped.
# The default value is: Doxygen generated docs.
# This tag requires that the tag GENERATE_DOCSET is set to YES.

DOCSET_FEEDNAME        = "Doxygen generated docs"

# This tag specifies a string that should uniquely identify the documentation
# set bundle. This should be a reverse domain-name style string, e.g.
# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
# The default value is: org.doxygen.Project.
# This tag requires that the tag GENERATE_DOCSET is set to YES.

DOCSET_BUNDLE_ID       = org.doxygen.Project

# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
# the documentation publisher. This should be a reverse domain-name style
# string, e.g. com.mycompany.MyDocSet.documentation.
# The default value is: org.doxygen.Publisher.
# This tag requires that the tag GENERATE_DOCSET is set to YES.

DOCSET_PUBLISHER_ID    = org.doxygen.Publisher

# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
# The default value is: Publisher.
# This tag requires that the tag GENERATE_DOCSET is set to YES.

DOCSET_PUBLISHER_NAME  = Publisher
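
# A hypothetical, commented-out sketch of the identifiers one might use if Xcode
# docset generation were ever enabled for these docs. The reverse-domain names
# are placeholders following the pattern described above, not identifiers used
# by this project.
#
# GENERATE_DOCSET       = YES
# DOCSET_BUNDLE_ID      = com.example.hipCUB
# DOCSET_PUBLISHER_ID   = com.example.documentation
# DOCSET_PUBLISHER_NAME = "Example Publisher"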

# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
# Windows.
#
# The HTML Help Workshop contains a compiler that can convert all HTML output
# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
# files are now used as the Windows 98 help format, and will replace the old
# Windows help format (.hlp) on all Windows platforms in the future. Compressed
# HTML files also contain an index, a table of contents, and you can search for
# words in the documentation. The HTML workshop also contains a viewer for
# compressed HTML files.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

GENERATE_HTMLHELP      = NO

# The CHM_FILE tag can be used to specify the file name of the resulting .chm
# file. You can add a path in front of the file if the result should not be
# written to the html output directory.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.

CHM_FILE               =

# The HHC_LOCATION tag can be used to specify the location (absolute path
# including file name) of the HTML help compiler (hhc.exe). If non-empty,
# doxygen will try to run the HTML help compiler on the generated index.hhp.
# The file has to be specified with full path.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.

HHC_LOCATION           =

# The GENERATE_CHI flag controls whether a separate .chi index file is generated
# (YES) or whether it should be included in the master .chm file (NO).
# The default value is: NO.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.

GENERATE_CHI           = NO

# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
# and project file content.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.

CHM_INDEX_ENCODING     =

# The BINARY_TOC flag controls whether a binary table of contents is generated
# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
# enables the Previous and Next buttons.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.

BINARY_TOC             = NO

# The TOC_EXPAND flag can be set to YES to add extra items for group members to
# the table of contents of the HTML help documentation and to the tree view.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.

TOC_EXPAND             = NO

# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
# (.qch) of the generated HTML documentation.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

GENERATE_QHP           = NO

# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
# the file name of the resulting .qch file. The path specified is relative to
# the HTML output folder.
# This tag requires that the tag GENERATE_QHP is set to YES.

QCH_FILE               =

# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
# Project output. For more information please see Qt Help Project / Namespace
# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
# The default value is: org.doxygen.Project.
# This tag requires that the tag GENERATE_QHP is set to YES.

QHP_NAMESPACE          =

# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
# Help Project output. For more information please see Qt Help Project / Virtual
# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
# folders).
# The default value is: doc.
# This tag requires that the tag GENERATE_QHP is set to YES.

QHP_VIRTUAL_FOLDER     = doc

# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
# filter to add. For more information please see Qt Help Project / Custom
# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
# filters).
# This tag requires that the tag GENERATE_QHP is set to YES.

QHP_CUST_FILTER_NAME   =
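
# A hypothetical, commented-out example of the minimum settings Qt Compressed
# Help generation would need if it were enabled. The namespace and output file
# name are placeholders, and qhelpgenerator is assumed to be on the PATH; none
# of this is used by the current configuration.
#
# GENERATE_QHP       = YES
# QHP_NAMESPACE      = com.example.hipCUB
# QHP_VIRTUAL_FOLDER = doc
# QCH_FILE           = hipCUB.qch
# QHG_LOCATION       = qhelpgenerator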

# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
# custom filter to add. For more information please see Qt Help Project / Custom
# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
# filters).
# This tag requires that the tag GENERATE_QHP is set to YES.

QHP_CUST_FILTER_ATTRS  =

# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
# project's filter section matches. Qt Help Project / Filter Attributes (see:
# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
# This tag requires that the tag GENERATE_QHP is set to YES.

QHP_SECT_FILTER_ATTRS  =

# The QHG_LOCATION tag can be used to specify the location of Qt's
# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
# generated .qhp file.
# This tag requires that the tag GENERATE_QHP is set to YES.

QHG_LOCATION           =

# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
# generated that, together with the HTML files, form an Eclipse help plugin. To
# install this plugin and make it available under the help contents menu in
# Eclipse, the contents of the directory containing the HTML and XML files needs
# to be copied into the plugins directory of eclipse. The name of the directory
# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
# After copying Eclipse needs to be restarted before the help appears.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

GENERATE_ECLIPSEHELP   = NO

# A unique identifier for the Eclipse help plugin. When installing the plugin
# the directory name containing the HTML and XML files should also have this
# name. Each documentation set should have its own identifier.
# The default value is: org.doxygen.Project.
# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.

ECLIPSE_DOC_ID         = org.doxygen.Project

# If you want full control over the layout of the generated HTML pages it might
# be necessary to disable the index and replace it with your own. The
# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
# of each HTML page. A value of NO enables the index and the value YES disables
# it. Since the tabs in the index contain the same information as the navigation
# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

DISABLE_INDEX          = NO

# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
# structure should be generated to display hierarchical information. If the tag
# value is set to YES, a side panel will be generated containing a tree-like
# index structure (just like the one that is generated for HTML Help). For this
# to work a browser that supports JavaScript, DHTML, CSS and frames is required
# (i.e. any modern browser). Windows users are probably better off using the
# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
# further fine-tune the look of the index. As an example, the default style
# sheet generated by doxygen has an example that shows how to put an image at
# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
# the same information as the tab index, you could consider setting
# DISABLE_INDEX to YES when enabling this option.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

GENERATE_TREEVIEW      = NO
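
# Commented-out illustration of the combination suggested in the two comments
# above: a side-panel navigation tree with the redundant top tabs turned off.
# This configuration keeps the defaults (tabs shown, no tree view).
#
# GENERATE_TREEVIEW = YES
# DISABLE_INDEX     = YES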

# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
# doxygen will group on one line in the generated HTML documentation.
#
# Note that a value of 0 will completely suppress the enum values from appearing
# in the overview section.
# Minimum value: 0, maximum value: 20, default value: 4.
# This tag requires that the tag GENERATE_HTML is set to YES.

ENUM_VALUES_PER_LINE   = 4

# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
# to set the initial width (in pixels) of the frame in which the tree is shown.
# Minimum value: 0, maximum value: 1500, default value: 250.
# This tag requires that the tag GENERATE_HTML is set to YES.

TREEVIEW_WIDTH         = 250

# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
# external symbols imported via tag files in a separate window.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

EXT_LINKS_IN_WINDOW    = NO

# Use this tag to change the font size of LaTeX formulas included as images in
# the HTML documentation. When you change the font size after a successful
# doxygen run you need to manually remove any form_*.png images from the HTML
# output directory to force them to be regenerated.
# Minimum value: 8, maximum value: 50, default value: 10.
# This tag requires that the tag GENERATE_HTML is set to YES.

FORMULA_FONTSIZE       = 10

# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
# generated for formulas are transparent PNGs. Transparent PNGs are not
# supported properly for IE 6.0, but are supported on all modern browsers.
#
# Note that when changing this option you need to delete any form_*.png files in
# the HTML output directory before the changes have effect.
# The default value is: YES.
# This tag requires that the tag GENERATE_HTML is set to YES.

FORMULA_TRANSPARENT    = YES

# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
# http://www.mathjax.org) which uses client side JavaScript for the rendering
# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
# installed or if you want the formulas to look prettier in the HTML output.
# When enabled you may also need to install MathJax separately and configure the
# path to it using the MATHJAX_RELPATH option.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

USE_MATHJAX            = YES

# When MathJax is enabled you can set the default output format to be used for
# the MathJax output. See the MathJax site (see:
# http://docs.mathjax.org/en/latest/output.html) for more details.
# Possible values are: HTML-CSS (which is slower, but has the best
# compatibility), NativeMML (i.e. MathML) and SVG.
# The default value is: HTML-CSS.
# This tag requires that the tag USE_MATHJAX is set to YES.

MATHJAX_FORMAT         = HTML-CSS
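
# Commented-out sketch of the locally hosted MathJax setup recommended for
# deployment in the MATHJAX_RELPATH comment below, together with the extension
# list given as an example under MATHJAX_EXTENSIONS. The ../mathjax directory is
# an assumption about where a local copy would be placed; this configuration
# currently loads MathJax from the CDN instead.
#
# MATHJAX_RELPATH    = ../mathjax
# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols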

# When MathJax is enabled you need to specify the location relative to the HTML
# output directory using the MATHJAX_RELPATH option. The destination directory
# should contain the MathJax.js script. For instance, if the mathjax directory
# is located at the same level as the HTML output directory, then
# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
# Content Delivery Network so you can quickly see the result without installing
# MathJax. However, it is strongly recommended to install a local copy of
# MathJax from http://www.mathjax.org before deployment.
# The default value is: http://cdn.mathjax.org/mathjax/latest.
# This tag requires that the tag USE_MATHJAX is set to YES.

MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest

# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
# extension names that should be enabled during MathJax rendering. For example
# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
# This tag requires that the tag USE_MATHJAX is set to YES.

MATHJAX_EXTENSIONS     =

# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
# of code that will be used on startup of the MathJax code. See the MathJax site
# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
# example see the documentation.
# This tag requires that the tag USE_MATHJAX is set to YES.

MATHJAX_CODEFILE       =

# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
# the HTML output. The underlying search engine uses javascript and DHTML and
# should work on any modern browser. Note that when using HTML help
# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
# there is already a search function so this one should typically be disabled.
# For large projects the javascript based search engine can be slow; in that
# case enabling SERVER_BASED_SEARCH may provide a better solution. It is
# possible to search using the keyboard; to jump to the search box use
# <access key> + S (what the <access key> is depends on the OS and browser, but
# it is typically <CTRL>, <ALT>/